From 8640b85acfc861531dff37e5952a9ae763e0d55f Mon Sep 17 00:00:00 2001 From: HDCharles Date: Tue, 12 Dec 2023 18:05:07 -0800 Subject: [PATCH] [not for land] testing torchao coverage on torchbench/dynamo models Summary: testing locally accuracy and perf Test Plan: sh torchao_benchmarks.sh Reviewers: Subscribers: Tasks: Tags: ghstack-source-id: 98a007a42e7c024fd8fb87f2d92223ffc528e3c3 Pull Request resolved: https://github.com/pytorch/benchmark/pull/2075 --- log.log | 34549 ++++++++++++++++ torchao_benchmarks.sh | 25 + userbenchmark/dynamo/dynamobench/common.py | 50 +- .../dynamo/dynamobench/torchbench.py | 11 + 4 files changed, 34627 insertions(+), 8 deletions(-) create mode 100644 log.log create mode 100644 torchao_benchmarks.sh diff --git a/log.log b/log.log new file mode 100644 index 0000000000..39ac878584 --- /dev/null +++ b/log.log @@ -0,0 +1,34549 @@ +start dynamic + loading model: 0it [00:00, ?it/s] loading model: 0it [00:00, ?it/s] +torchrec_dlrm +/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/fbgemm_gpu/fbgemm_gpu_py.so: undefined symbol: _ZNK5torch8autograd4Node4nameEv +Traceback (most recent call last): + File "/home/cdhernandez/local/benchmark/run_benchmark.py", line 36, in + run() + File "/home/cdhernandez/local/benchmark/run_benchmark.py", line 30, in run + benchmark.run(bm_args) + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/run.py", line 24, in run + main(TorchBenchmarkRunner(), original_dir, args) + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 3133, in main + process_entry(0, runner, original_dir, args) + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 3090, in process_entry + return maybe_fresh_cache( + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 1714, in inner + return fn(*args, **kwargs) + File 
"/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 3559, in run + ) = runner.load_model( + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/torchbench.py", line 383, in load_model + module = importlib.import_module(c) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/importlib/__init__.py", line 126, in import_module + return _bootstrap._gcd_import(name[level:], package, level) + File "", line 1050, in _gcd_import + File "", line 1027, in _find_and_load + File "", line 1006, in _find_and_load_unlocked + File "", line 688, in _load_unlocked + File "", line 883, in exec_module + File "", line 241, in _call_with_frames_removed + File "/home/cdhernandez/local/benchmark/torchbenchmark/canary_models/torchrec_dlrm/__init__.py", line 7, in + from .data.dlrm_dataloader import get_dataloader + File "/home/cdhernandez/local/benchmark/torchbenchmark/canary_models/torchrec_dlrm/data/dlrm_dataloader.py", line 13, in + from torchrec.datasets.criteo import ( + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/torchrec/__init__.py", line 8, in + import torchrec.distributed # noqa + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/torchrec/distributed/__init__.py", line 36, in + from torchrec.distributed.model_parallel import DistributedModelParallel # noqa + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/torchrec/distributed/model_parallel.py", line 21, in + from torchrec.distributed.planner import EmbeddingShardingPlanner, Topology + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/torchrec/distributed/planner/__init__.py", line 22, in + from torchrec.distributed.planner.planners import EmbeddingShardingPlanner # noqa + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/torchrec/distributed/planner/planners.py", line 19, in + from 
torchrec.distributed.planner.constants import BATCH_SIZE, MAX_SIZE + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/torchrec/distributed/planner/constants.py", line 10, in + from torchrec.distributed.embedding_types import EmbeddingComputeKernel + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/torchrec/distributed/embedding_types.py", line 14, in + from fbgemm_gpu.split_table_batched_embeddings_ops_training import EmbeddingLocation + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/fbgemm_gpu/__init__.py", line 23, in + from . import _fbgemm_gpu_docs, sparse_ops # noqa: F401, E402 # noqa: F401, E402 + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/fbgemm_gpu/_fbgemm_gpu_docs.py", line 19, in + torch.ops.fbgemm.jagged_2d_to_dense, + File "/home/cdhernandez/local/pytorch/torch/_ops.py", line 822, in __getattr__ + raise AttributeError( +AttributeError: '_OpNamespace' 'fbgemm' object has no attribute 'jagged_2d_to_dense' +Run failed with return code: 1 +Output: None +Error: None + loading model: 0it [00:00, ?it/s] loading model: 0it [00:03, ?it/s] +BERT_pytorch +cuda eval BERT_pytorch int8dynamic + running benchmark: 0%| | 0/30 [00:00= MINSIZE && prev_inuse (old_top) && ((unsigned long) old_end & (pagesize - 1)) == 0) +Run failed with return code: -6 +Output: None +Error: None + loading model: 0it [00:00, ?it/s]WARNING:common:Model doctr_reco_predictor does not support bfloat16, running with amp instead + loading model: 0it [00:06, ?it/s] +WARNING:common:Model doctr_reco_predictor does not support bfloat16, running with amp instead +doctr_reco_predictor +cuda eval doctr_reco_predictor int8dynamic +WARNING:common:Model doctr_reco_predictor does not support bfloat16, running with amp instead + running benchmark: 0%| | 0/30 [00:00 
/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/diffusers/models/attention_processor.py(1236)__call__() +-> hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim) +(Pdb) TIMEOUT + loading model: 0it [00:00, ?it/s] loading model: 0it [00:05, ?it/s] +timm_efficientdet +cuda eval timm_efficientdet int8dynamic + running benchmark: 0%| | 0/30 [00:00 + run() + File "/home/cdhernandez/local/benchmark/run_benchmark.py", line 30, in run + benchmark.run(bm_args) + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/run.py", line 24, in run + main(TorchBenchmarkRunner(), original_dir, args) + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 3133, in main + process_entry(0, runner, original_dir, args) + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 3090, in process_entry + return maybe_fresh_cache( + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 1714, in inner + return fn(*args, **kwargs) + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 3559, in run + ) = runner.load_model( + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/torchbench.py", line 383, in load_model + module = importlib.import_module(c) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/importlib/__init__.py", line 126, in import_module + return _bootstrap._gcd_import(name[level:], package, level) + File "", line 1050, in _gcd_import + File "", line 1027, in _find_and_load + File "", line 1006, in _find_and_load_unlocked + File "", line 688, in _load_unlocked + File "", line 883, in exec_module + File "", line 241, in _call_with_frames_removed + File "/home/cdhernandez/local/benchmark/torchbenchmark/canary_models/torchrec_dlrm/__init__.py", line 7, in + from .data.dlrm_dataloader import get_dataloader + File 
"/home/cdhernandez/local/benchmark/torchbenchmark/canary_models/torchrec_dlrm/data/dlrm_dataloader.py", line 13, in + from torchrec.datasets.criteo import ( + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/torchrec/__init__.py", line 8, in + import torchrec.distributed # noqa + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/torchrec/distributed/__init__.py", line 36, in + from torchrec.distributed.model_parallel import DistributedModelParallel # noqa + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/torchrec/distributed/model_parallel.py", line 21, in + from torchrec.distributed.planner import EmbeddingShardingPlanner, Topology + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/torchrec/distributed/planner/__init__.py", line 22, in + from torchrec.distributed.planner.planners import EmbeddingShardingPlanner # noqa + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/torchrec/distributed/planner/planners.py", line 19, in + from torchrec.distributed.planner.constants import BATCH_SIZE, MAX_SIZE + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/torchrec/distributed/planner/constants.py", line 10, in + from torchrec.distributed.embedding_types import EmbeddingComputeKernel + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/torchrec/distributed/embedding_types.py", line 14, in + from fbgemm_gpu.split_table_batched_embeddings_ops_training import EmbeddingLocation + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/fbgemm_gpu/__init__.py", line 23, in + from . 
import _fbgemm_gpu_docs, sparse_ops # noqa: F401, E402 # noqa: F401, E402 + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/fbgemm_gpu/_fbgemm_gpu_docs.py", line 19, in + torch.ops.fbgemm.jagged_2d_to_dense, + File "/home/cdhernandez/local/pytorch/torch/_ops.py", line 822, in __getattr__ + raise AttributeError( +AttributeError: '_OpNamespace' 'fbgemm' object has no attribute 'jagged_2d_to_dense' +Run failed with return code: 1 +Output: None +Error: None + loading model: 0it [00:00, ?it/s] loading model: 0it [00:04, ?it/s] +BERT_pytorch +cuda eval BERT_pytorch int8weightonly + running benchmark: 0%| | 0/30 [00:00 + run() + File "/home/cdhernandez/local/benchmark/run_benchmark.py", line 30, in run + benchmark.run(bm_args) + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/run.py", line 24, in run + main(TorchBenchmarkRunner(), original_dir, args) + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 3133, in main + process_entry(0, runner, original_dir, args) + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 3090, in process_entry + return maybe_fresh_cache( + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 1714, in inner + return fn(*args, **kwargs) + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 3559, in run + ) = runner.load_model( + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/torchbench.py", line 383, in load_model + module = importlib.import_module(c) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/importlib/__init__.py", line 126, in import_module + return _bootstrap._gcd_import(name[level:], package, level) + File "", line 1050, in _gcd_import + File "", line 1027, in _find_and_load + File "", line 1006, in _find_and_load_unlocked + File "", line 688, in _load_unlocked + File "", line 883, in 
exec_module + File "", line 241, in _call_with_frames_removed + File "/home/cdhernandez/local/benchmark/torchbenchmark/canary_models/torchrec_dlrm/__init__.py", line 7, in + from .data.dlrm_dataloader import get_dataloader + File "/home/cdhernandez/local/benchmark/torchbenchmark/canary_models/torchrec_dlrm/data/dlrm_dataloader.py", line 13, in + from torchrec.datasets.criteo import ( + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/torchrec/__init__.py", line 8, in + import torchrec.distributed # noqa + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/torchrec/distributed/__init__.py", line 36, in + from torchrec.distributed.model_parallel import DistributedModelParallel # noqa + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/torchrec/distributed/model_parallel.py", line 21, in + from torchrec.distributed.planner import EmbeddingShardingPlanner, Topology + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/torchrec/distributed/planner/__init__.py", line 22, in + from torchrec.distributed.planner.planners import EmbeddingShardingPlanner # noqa + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/torchrec/distributed/planner/planners.py", line 19, in + from torchrec.distributed.planner.constants import BATCH_SIZE, MAX_SIZE + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/torchrec/distributed/planner/constants.py", line 10, in + from torchrec.distributed.embedding_types import EmbeddingComputeKernel + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/torchrec/distributed/embedding_types.py", line 14, in + from fbgemm_gpu.split_table_batched_embeddings_ops_training import EmbeddingLocation + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/fbgemm_gpu/__init__.py", line 23, in + from . 
import _fbgemm_gpu_docs, sparse_ops # noqa: F401, E402 # noqa: F401, E402 + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/fbgemm_gpu/_fbgemm_gpu_docs.py", line 19, in + torch.ops.fbgemm.jagged_2d_to_dense, + File "/home/cdhernandez/local/pytorch/torch/_ops.py", line 822, in __getattr__ + raise AttributeError( +AttributeError: '_OpNamespace' 'fbgemm' object has no attribute 'jagged_2d_to_dense' +Run failed with return code: 1 +Output: None +Error: None + loading model: 0it [00:00, ?it/s] loading model: 0it [00:04, ?it/s] +BERT_pytorch +cuda eval BERT_pytorch int4weightonly + running benchmark: 0%| | 0/30 [00:00 + run() + File "/home/cdhernandez/local/benchmark/run_benchmark.py", line 30, in run + benchmark.run(bm_args) + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/run.py", line 24, in run + main(TorchBenchmarkRunner(), original_dir, args) + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 3133, in main + process_entry(0, runner, original_dir, args) + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 3090, in process_entry + return maybe_fresh_cache( + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 1714, in inner + return fn(*args, **kwargs) + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 3559, in run + ) = runner.load_model( + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/torchbench.py", line 383, in load_model + module = importlib.import_module(c) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/importlib/__init__.py", line 126, in import_module + return _bootstrap._gcd_import(name[level:], package, level) + File "", line 1050, in _gcd_import + File "", line 1027, in _find_and_load + File "", line 1006, in _find_and_load_unlocked + File "", line 688, in _load_unlocked + File "", line 883, in 
exec_module + File "", line 241, in _call_with_frames_removed + File "/home/cdhernandez/local/benchmark/torchbenchmark/canary_models/torchrec_dlrm/__init__.py", line 7, in + from .data.dlrm_dataloader import get_dataloader + File "/home/cdhernandez/local/benchmark/torchbenchmark/canary_models/torchrec_dlrm/data/dlrm_dataloader.py", line 13, in + from torchrec.datasets.criteo import ( + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/torchrec/__init__.py", line 8, in + import torchrec.distributed # noqa + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/torchrec/distributed/__init__.py", line 36, in + from torchrec.distributed.model_parallel import DistributedModelParallel # noqa + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/torchrec/distributed/model_parallel.py", line 21, in + from torchrec.distributed.planner import EmbeddingShardingPlanner, Topology + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/torchrec/distributed/planner/__init__.py", line 22, in + from torchrec.distributed.planner.planners import EmbeddingShardingPlanner # noqa + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/torchrec/distributed/planner/planners.py", line 19, in + from torchrec.distributed.planner.constants import BATCH_SIZE, MAX_SIZE + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/torchrec/distributed/planner/constants.py", line 10, in + from torchrec.distributed.embedding_types import EmbeddingComputeKernel + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/torchrec/distributed/embedding_types.py", line 14, in + from fbgemm_gpu.split_table_batched_embeddings_ops_training import EmbeddingLocation + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/fbgemm_gpu/__init__.py", line 23, in + from . 
import _fbgemm_gpu_docs, sparse_ops # noqa: F401, E402 # noqa: F401, E402 + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/fbgemm_gpu/_fbgemm_gpu_docs.py", line 19, in + torch.ops.fbgemm.jagged_2d_to_dense, + File "/home/cdhernandez/local/pytorch/torch/_ops.py", line 822, in __getattr__ + raise AttributeError( +AttributeError: '_OpNamespace' 'fbgemm' object has no attribute 'jagged_2d_to_dense' +Run failed with return code: 1 +Output: None +Error: None + loading model: 0it [00:00, ?it/s] loading model: 0it [00:04, ?it/s] +BERT_pytorch +cuda eval BERT_pytorch baseline + running benchmark: 0%| | 0/30 [00:00 will be ignored +[rank0]:[2023-12-12 03:31:06,215] [1/0_1] torch._dynamo.backends.distributed: [WARNING] Some buckets were extended beyond their requested parameter capacities in order to ensure each subgraph has an output node, required for fx graph partitioning. This can be the case when a subgraph would have only contained nodes performing inplace mutation, and returning no logical outputs. This should not be a problem, unless it results in too few graph partitions for optimal DDP performance. 
+[rank0]:[2023-12-12 03:31:06,238] [1/0_1] torch._dynamo.backends.distributed: [WARNING] DDPOptimizer extended these buckets to ensure per-subgraph output nodes: +[rank0]:[2023-12-12 03:31:06,238] [1/0_1] torch._dynamo.backends.distributed: [WARNING] ┌─────────┬─────────────┬────────────────────────┐ +[rank0]:[2023-12-12 03:31:06,238] [1/0_1] torch._dynamo.backends.distributed: [WARNING] │ Index │ Extra Ops │ Extra Param Size (b) │ +[rank0]:[2023-12-12 03:31:06,238] [1/0_1] torch._dynamo.backends.distributed: [WARNING] ├─────────┼─────────────┼────────────────────────┤ +[rank0]:[2023-12-12 03:31:06,238] [1/0_1] torch._dynamo.backends.distributed: [WARNING] │ 0 │ 157 │ 44910720 │ +[rank0]:[2023-12-12 03:31:06,238] [1/0_1] torch._dynamo.backends.distributed: [WARNING] └─────────┴─────────────┴────────────────────────┘ +skipping cudagraphs due to ['mutated inputs'] +[rank0]:[2023-12-12 03:31:29,846] [5/0_1] torch._inductor.utils: [WARNING] DeviceCopy in input program +skipping cudagraphs due to ['non-cuda device in graph'] +[rank0]:[W CUDAGraph.cpp:145] Warning: Waiting for pending NCCL work to finish before starting graph capture. 
(function operator()) + running benchmark: 0%| | 0/30 [00:00 + run() + File "/home/cdhernandez/local/benchmark/run_benchmark.py", line 30, in run + benchmark.run(bm_args) + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/run.py", line 24, in run + main(TorchBenchmarkRunner(), original_dir, args) + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 3133, in main + process_entry(0, runner, original_dir, args) + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 3090, in process_entry + return maybe_fresh_cache( + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 1714, in inner + return fn(*args, **kwargs) + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 3559, in run + ) = runner.load_model( + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/torchbench.py", line 383, in load_model + module = importlib.import_module(c) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/importlib/__init__.py", line 126, in import_module + return _bootstrap._gcd_import(name[level:], package, level) + File "", line 1050, in _gcd_import + File "", line 1027, in _find_and_load + File "", line 1006, in _find_and_load_unlocked + File "", line 688, in _load_unlocked + File "", line 883, in exec_module + File "", line 241, in _call_with_frames_removed + File "/home/cdhernandez/local/benchmark/torchbenchmark/canary_models/torchrec_dlrm/__init__.py", line 7, in + from .data.dlrm_dataloader import get_dataloader + File "/home/cdhernandez/local/benchmark/torchbenchmark/canary_models/torchrec_dlrm/data/dlrm_dataloader.py", line 13, in + from torchrec.datasets.criteo import ( + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/torchrec/__init__.py", line 8, in + import torchrec.distributed # noqa + File 
"/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/torchrec/distributed/__init__.py", line 36, in + from torchrec.distributed.model_parallel import DistributedModelParallel # noqa + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/torchrec/distributed/model_parallel.py", line 21, in + from torchrec.distributed.planner import EmbeddingShardingPlanner, Topology + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/torchrec/distributed/planner/__init__.py", line 22, in + from torchrec.distributed.planner.planners import EmbeddingShardingPlanner # noqa + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/torchrec/distributed/planner/planners.py", line 19, in + from torchrec.distributed.planner.constants import BATCH_SIZE, MAX_SIZE + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/torchrec/distributed/planner/constants.py", line 10, in + from torchrec.distributed.embedding_types import EmbeddingComputeKernel + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/torchrec/distributed/embedding_types.py", line 14, in + from fbgemm_gpu.split_table_batched_embeddings_ops_training import EmbeddingLocation + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/fbgemm_gpu/__init__.py", line 23, in + from . 
import _fbgemm_gpu_docs, sparse_ops # noqa: F401, E402 # noqa: F401, E402 + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/fbgemm_gpu/_fbgemm_gpu_docs.py", line 19, in + torch.ops.fbgemm.jagged_2d_to_dense, + File "/home/cdhernandez/local/pytorch/torch/_ops.py", line 822, in __getattr__ + raise AttributeError( +AttributeError: '_OpNamespace' 'fbgemm' object has no attribute 'jagged_2d_to_dense' +Run failed with return code: 1 +Output: None +Error: None + loading model: 0it [00:00, ?it/s] loading model: 0it [00:03, ?it/s] +BERT_pytorch +cuda eval BERT_pytorch int8weightonly-bs1 + running benchmark: 0%| | 0/30 [00:00= MINSIZE && prev_inuse (old_top) && ((unsigned long) old_end & (pagesize - 1)) == 0) +Run failed with return code: -6 +Output: None +Error: None + loading model: 0it [00:00, ?it/s]WARNING:common:Model doctr_reco_predictor does not support bfloat16, running with amp instead + loading model: 0it [00:05, ?it/s] +WARNING:common:Model doctr_reco_predictor does not support bfloat16, running with amp instead +doctr_reco_predictor +cuda eval doctr_reco_predictor int8weightonly-bs1 +WARNING:common:Model doctr_reco_predictor does not support bfloat16, running with amp instead + running benchmark: 0%| | 0/30 [00:00 + run() + File "/home/cdhernandez/local/benchmark/run_benchmark.py", line 30, in run + benchmark.run(bm_args) + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/run.py", line 24, in run + main(TorchBenchmarkRunner(), original_dir, args) + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 3133, in main + process_entry(0, runner, original_dir, args) + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 3090, in process_entry + return maybe_fresh_cache( + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 1714, in inner + return fn(*args, **kwargs) + File 
"/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 3559, in run + ) = runner.load_model( + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/torchbench.py", line 383, in load_model + module = importlib.import_module(c) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/importlib/__init__.py", line 126, in import_module + return _bootstrap._gcd_import(name[level:], package, level) + File "", line 1050, in _gcd_import + File "", line 1027, in _find_and_load + File "", line 1006, in _find_and_load_unlocked + File "", line 688, in _load_unlocked + File "", line 883, in exec_module + File "", line 241, in _call_with_frames_removed + File "/home/cdhernandez/local/benchmark/torchbenchmark/canary_models/torchrec_dlrm/__init__.py", line 7, in + from .data.dlrm_dataloader import get_dataloader + File "/home/cdhernandez/local/benchmark/torchbenchmark/canary_models/torchrec_dlrm/data/dlrm_dataloader.py", line 13, in + from torchrec.datasets.criteo import ( + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/torchrec/__init__.py", line 8, in + import torchrec.distributed # noqa + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/torchrec/distributed/__init__.py", line 36, in + from torchrec.distributed.model_parallel import DistributedModelParallel # noqa + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/torchrec/distributed/model_parallel.py", line 21, in + from torchrec.distributed.planner import EmbeddingShardingPlanner, Topology + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/torchrec/distributed/planner/__init__.py", line 22, in + from torchrec.distributed.planner.planners import EmbeddingShardingPlanner # noqa + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/torchrec/distributed/planner/planners.py", line 19, in + from 
torchrec.distributed.planner.constants import BATCH_SIZE, MAX_SIZE + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/torchrec/distributed/planner/constants.py", line 10, in + from torchrec.distributed.embedding_types import EmbeddingComputeKernel + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/torchrec/distributed/embedding_types.py", line 14, in + from fbgemm_gpu.split_table_batched_embeddings_ops_training import EmbeddingLocation + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/fbgemm_gpu/__init__.py", line 23, in + from . import _fbgemm_gpu_docs, sparse_ops # noqa: F401, E402 # noqa: F401, E402 + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/fbgemm_gpu/_fbgemm_gpu_docs.py", line 19, in + torch.ops.fbgemm.jagged_2d_to_dense, + File "/home/cdhernandez/local/pytorch/torch/_ops.py", line 822, in __getattr__ + raise AttributeError( +AttributeError: '_OpNamespace' 'fbgemm' object has no attribute 'jagged_2d_to_dense' +Run failed with return code: 1 +Output: None +Error: None + loading model: 0it [00:00, ?it/s] loading model: 0it [00:04, ?it/s] +BERT_pytorch +cuda eval BERT_pytorch int4weightonly-bs1 + running benchmark: 0%| | 0/30 [00:00 + run() + File "/home/cdhernandez/local/benchmark/run_benchmark.py", line 30, in run + benchmark.run(bm_args) + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/run.py", line 24, in run + main(TorchBenchmarkRunner(), original_dir, args) + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 3133, in main + process_entry(0, runner, original_dir, args) + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 3090, in process_entry + return maybe_fresh_cache( + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 1714, in inner + return fn(*args, **kwargs) + File 
"/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 3559, in run + ) = runner.load_model( + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/torchbench.py", line 383, in load_model + module = importlib.import_module(c) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/importlib/__init__.py", line 126, in import_module + return _bootstrap._gcd_import(name[level:], package, level) + File "", line 1050, in _gcd_import + File "", line 1027, in _find_and_load + File "", line 1006, in _find_and_load_unlocked + File "", line 688, in _load_unlocked + File "", line 883, in exec_module + File "", line 241, in _call_with_frames_removed + File "/home/cdhernandez/local/benchmark/torchbenchmark/canary_models/torchrec_dlrm/__init__.py", line 7, in + from .data.dlrm_dataloader import get_dataloader + File "/home/cdhernandez/local/benchmark/torchbenchmark/canary_models/torchrec_dlrm/data/dlrm_dataloader.py", line 13, in + from torchrec.datasets.criteo import ( + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/torchrec/__init__.py", line 8, in + import torchrec.distributed # noqa + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/torchrec/distributed/__init__.py", line 36, in + from torchrec.distributed.model_parallel import DistributedModelParallel # noqa + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/torchrec/distributed/model_parallel.py", line 21, in + from torchrec.distributed.planner import EmbeddingShardingPlanner, Topology + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/torchrec/distributed/planner/__init__.py", line 22, in + from torchrec.distributed.planner.planners import EmbeddingShardingPlanner # noqa + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/torchrec/distributed/planner/planners.py", line 19, in + from 
torchrec.distributed.planner.constants import BATCH_SIZE, MAX_SIZE + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/torchrec/distributed/planner/constants.py", line 10, in + from torchrec.distributed.embedding_types import EmbeddingComputeKernel + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/torchrec/distributed/embedding_types.py", line 14, in + from fbgemm_gpu.split_table_batched_embeddings_ops_training import EmbeddingLocation + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/fbgemm_gpu/__init__.py", line 23, in + from . import _fbgemm_gpu_docs, sparse_ops # noqa: F401, E402 # noqa: F401, E402 + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/fbgemm_gpu/_fbgemm_gpu_docs.py", line 19, in + torch.ops.fbgemm.jagged_2d_to_dense, + File "/home/cdhernandez/local/pytorch/torch/_ops.py", line 822, in __getattr__ + raise AttributeError( +AttributeError: '_OpNamespace' 'fbgemm' object has no attribute 'jagged_2d_to_dense' +Run failed with return code: 1 +Output: None +Error: None + loading model: 0it [00:00, ?it/s] loading model: 0it [00:04, ?it/s] +BERT_pytorch +cuda eval BERT_pytorch baseline-bs1 +AUTOTUNE addmm(128x768, 128x768, 768x768) + bias_addmm 0.0127 ms 100.0% + triton_mm_5 0.0131 ms 97.5% + triton_mm_9 0.0131 ms 97.3% + triton_mm_6 0.0135 ms 94.4% + triton_mm_8 0.0147 ms 86.7% + addmm 0.0165 ms 77.0% + triton_mm_3 0.0170 ms 75.0% + triton_mm_4 0.0171 ms 74.3% + triton_mm_2 0.0205 ms 62.1% + triton_mm_1 0.0205 ms 62.0% +SingleProcess AUTOTUNE takes 5.5990 seconds +AUTOTUNE mm(128x768, 768x768) + mm 0.0119 ms 100.0% + triton_mm_65 0.0125 ms 94.6% + triton_mm_66 0.0131 ms 90.7% + triton_mm_69 0.0133 ms 89.0% + triton_mm_68 0.0136 ms 87.5% + triton_mm_64 0.0158 ms 74.9% + triton_mm_63 0.0162 ms 73.4% + triton_mm_62 0.0192 ms 61.8% + triton_mm_61 0.0197 ms 60.1% + triton_mm_60 0.0279 ms 42.5% +SingleProcess AUTOTUNE takes 
5.1503 seconds +AUTOTUNE mm(128x768, 768x3072) + mm 0.0145 ms 100.0% + triton_mm_80 0.0150 ms 96.8% + triton_mm_76 0.0168 ms 86.6% + triton_mm_78 0.0170 ms 85.5% + triton_mm_75 0.0173 ms 84.1% + triton_mm_77 0.0178 ms 81.7% + triton_mm_81 0.0187 ms 77.6% + triton_mm_74 0.0202 ms 72.1% + triton_mm_73 0.0203 ms 71.7% + triton_mm_72 0.0300 ms 48.5% +SingleProcess AUTOTUNE takes 4.9015 seconds +AUTOTUNE mm(128x3072, 3072x768) + mm 0.0179 ms 100.0% + triton_mm_90 0.0298 ms 59.9% + triton_mm_89 0.0300 ms 59.4% + triton_mm_93 0.0310 ms 57.6% + triton_mm_92 0.0343 ms 52.1% + triton_mm_88 0.0411 ms 43.5% + triton_mm_87 0.0415 ms 43.0% + triton_mm_86 0.0557 ms 32.1% + triton_mm_85 0.0557 ms 32.0% + triton_mm_84 0.0747 ms 23.9% +SingleProcess AUTOTUNE takes 4.8119 seconds + running benchmark: 0%| | 0/30 [00:00= MINSIZE && prev_inuse (old_top) && ((unsigned long) old_end & (pagesize - 1)) == 0) +Run failed with return code: -6 +Output: None +Error: None + loading model: 0it [00:00, ?it/s]WARNING:common:Model doctr_reco_predictor does not support bfloat16, running with amp instead + loading model: 0it [00:05, ?it/s] +WARNING:common:Model doctr_reco_predictor does not support bfloat16, running with amp instead +doctr_reco_predictor +cuda eval doctr_reco_predictor baseline-bs1 +WARNING:common:Model doctr_reco_predictor does not support bfloat16, running with amp instead + running benchmark: 0%| | 0/30 [00:00 will be ignored +[rank0]:[2023-12-12 09:46:53,460] [1/0_1] torch._dynamo.backends.distributed: [WARNING] Some buckets were extended beyond their requested parameter capacities in order to ensure each subgraph has an output node, required for fx graph partitioning. This can be the case when a subgraph would have only contained nodes performing inplace mutation, and returning no logical outputs. This should not be a problem, unless it results in too few graph partitions for optimal DDP performance. 
+[rank0]:[2023-12-12 09:46:53,486] [1/0_1] torch._dynamo.backends.distributed: [WARNING] DDPOptimizer extended these buckets to ensure per-subgraph output nodes: +[rank0]:[2023-12-12 09:46:53,486] [1/0_1] torch._dynamo.backends.distributed: [WARNING] ┌─────────┬─────────────┬────────────────────────┐ +[rank0]:[2023-12-12 09:46:53,486] [1/0_1] torch._dynamo.backends.distributed: [WARNING] │ Index │ Extra Ops │ Extra Param Size (b) │ +[rank0]:[2023-12-12 09:46:53,486] [1/0_1] torch._dynamo.backends.distributed: [WARNING] ├─────────┼─────────────┼────────────────────────┤ +[rank0]:[2023-12-12 09:46:53,486] [1/0_1] torch._dynamo.backends.distributed: [WARNING] │ 0 │ 157 │ 44910720 │ +[rank0]:[2023-12-12 09:46:53,486] [1/0_1] torch._dynamo.backends.distributed: [WARNING] └─────────┴─────────────┴────────────────────────┘ +AUTOTUNE addmm(1x128, 1x2048, 2048x128) + bias_addmm 0.0112 ms 100.0% + addmm 0.0112 ms 100.0% + triton_mm_540 0.0187 ms 60.1% + triton_mm_541 0.0198 ms 56.8% + triton_mm_543 0.0201 ms 55.9% + triton_mm_544 0.0210 ms 53.5% + triton_mm_539 0.0217 ms 51.7% + triton_mm_538 0.0241 ms 46.7% + triton_mm_537 0.0308 ms 36.4% + triton_mm_536 0.0331 ms 34.0% +SingleProcess AUTOTUNE takes 4.3309 seconds +skipping cudagraphs due to ['mutated inputs'] +[rank0]:[2023-12-12 09:47:26,228] [5/0_1] torch._inductor.utils: [WARNING] DeviceCopy in input program +skipping cudagraphs due to ['non-cuda device in graph'] +AUTOTUNE mm(1x2048, 2048x128) + mm 0.0083 ms 100.0% + triton_mm_1087 0.0177 ms 46.8% + triton_mm_1088 0.0188 ms 44.0% + triton_mm_1090 0.0196 ms 42.2% + triton_mm_1091 0.0205 ms 40.3% + triton_mm_1086 0.0207 ms 40.0% + triton_mm_1085 0.0236 ms 35.0% + triton_mm_1084 0.0308 ms 26.9% + triton_mm_1083 0.0326 ms 25.4% + triton_mm_1082 0.0543 ms 15.3% +SingleProcess AUTOTUNE takes 4.4818 seconds +AUTOTUNE bmm(1x1x128, 1x128x1) + triton_bmm_1096 0.0061 ms 100.0% + triton_bmm_1098 0.0061 ms 100.0% + bmm 0.0065 ms 93.1% + triton_bmm_1097 0.0066 ms 91.8% + 
triton_bmm_1099 0.0066 ms 91.8% + triton_bmm_1095 0.0070 ms 86.4% + triton_bmm_1094 0.0074 ms 82.3% + triton_bmm_1100 0.0081 ms 75.4% + triton_bmm_1101 0.0086 ms 70.6% +SingleProcess AUTOTUNE takes 2.5842 seconds +AUTOTUNE bmm(1x1x128, 1x128x32000) + triton_bmm_1102 0.0140 ms 100.0% + triton_bmm_1104 0.0145 ms 96.6% + triton_bmm_1103 0.0150 ms 93.2% + triton_bmm_1106 0.0152 ms 91.8% + triton_bmm_1108 0.0154 ms 90.6% + triton_bmm_1105 0.0157 ms 89.2% + triton_bmm_1111 0.0158 ms 88.6% + triton_bmm_1112 0.0158 ms 88.6% + triton_bmm_1113 0.0162 ms 86.4% + triton_bmm_1109 0.0181 ms 77.1% +SingleProcess AUTOTUNE takes 3.8806 seconds +[rank0]:[W CUDAGraph.cpp:145] Warning: Waiting for pending NCCL work to finish before starting graph capture. (function operator()) + running benchmark: 0%| | 0/30 [00:00 + run() + File "/home/cdhernandez/local/benchmark/run_benchmark.py", line 30, in run + benchmark.run(bm_args) + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/run.py", line 24, in run + main(TorchBenchmarkRunner(), original_dir, args) + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 3133, in main + process_entry(0, runner, original_dir, args) + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 3090, in process_entry + return maybe_fresh_cache( + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 1714, in inner + return fn(*args, **kwargs) + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 3559, in run + ) = runner.load_model( + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/torchbench.py", line 383, in load_model + module = importlib.import_module(c) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/importlib/__init__.py", line 126, in import_module + return _bootstrap._gcd_import(name[level:], package, level) + File "", line 1050, in _gcd_import + File 
"", line 1027, in _find_and_load + File "", line 1006, in _find_and_load_unlocked + File "", line 688, in _load_unlocked + File "", line 883, in exec_module + File "", line 241, in _call_with_frames_removed + File "/home/cdhernandez/local/benchmark/torchbenchmark/canary_models/torchrec_dlrm/__init__.py", line 7, in + from .data.dlrm_dataloader import get_dataloader + File "/home/cdhernandez/local/benchmark/torchbenchmark/canary_models/torchrec_dlrm/data/dlrm_dataloader.py", line 13, in + from torchrec.datasets.criteo import ( + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/torchrec/__init__.py", line 8, in + import torchrec.distributed # noqa + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/torchrec/distributed/__init__.py", line 36, in + from torchrec.distributed.model_parallel import DistributedModelParallel # noqa + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/torchrec/distributed/model_parallel.py", line 21, in + from torchrec.distributed.planner import EmbeddingShardingPlanner, Topology + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/torchrec/distributed/planner/__init__.py", line 22, in + from torchrec.distributed.planner.planners import EmbeddingShardingPlanner # noqa + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/torchrec/distributed/planner/planners.py", line 19, in + from torchrec.distributed.planner.constants import BATCH_SIZE, MAX_SIZE + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/torchrec/distributed/planner/constants.py", line 10, in + from torchrec.distributed.embedding_types import EmbeddingComputeKernel + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/torchrec/distributed/embedding_types.py", line 14, in + from fbgemm_gpu.split_table_batched_embeddings_ops_training import EmbeddingLocation + 
File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/fbgemm_gpu/__init__.py", line 23, in + from . import _fbgemm_gpu_docs, sparse_ops # noqa: F401, E402 # noqa: F401, E402 + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/fbgemm_gpu/_fbgemm_gpu_docs.py", line 19, in + torch.ops.fbgemm.jagged_2d_to_dense, + File "/home/cdhernandez/local/pytorch/torch/_ops.py", line 822, in __getattr__ + raise AttributeError( +AttributeError: '_OpNamespace' 'fbgemm' object has no attribute 'jagged_2d_to_dense' +Run failed with return code: 1 +Output: None +Error: None + loading model: 0it [00:00, ?it/s] loading model: 0it [00:04, ?it/s] +BERT_pytorch +cuda eval BERT_pytorch int8dynamic-bs32 +AUTOTUNE bmm(384x128x64, 384x64x128) + triton_bmm_30 0.0256 ms 100.0% + triton_bmm_23 0.0266 ms 96.0% + triton_bmm_24 0.0267 ms 95.9% + triton_bmm_25 0.0274 ms 93.3% + triton_bmm_26 0.0275 ms 93.1% + triton_bmm_32 0.0281 ms 91.1% + triton_bmm_22 0.0282 ms 90.6% + bmm 0.0292 ms 87.5% + triton_bmm_29 0.0296 ms 86.4% + triton_bmm_31 0.0316 ms 80.9% +SingleProcess AUTOTUNE takes 1.8196 seconds +AUTOTUNE bmm(384x128x128, 384x128x64) + triton_bmm_47 0.0282 ms 100.0% + triton_bmm_46 0.0296 ms 95.3% + triton_bmm_53 0.0299 ms 94.2% + triton_bmm_45 0.0300 ms 93.9% + triton_bmm_49 0.0301 ms 93.6% + triton_bmm_52 0.0301 ms 93.6% + triton_bmm_48 0.0313 ms 90.1% + triton_bmm_51 0.0322 ms 87.5% + triton_bmm_55 0.0325 ms 86.7% + triton_bmm_50 0.0334 ms 84.4% +SingleProcess AUTOTUNE takes 1.8928 seconds + running benchmark: 0%| | 0/30 [00:00 + async_compile.wait(globals()) + File "/home/cdhernandez/local/pytorch/torch/_inductor/codecache.py", line 2470, in wait + scope[key] = result.result() + File "/home/cdhernandez/local/pytorch/torch/_inductor/codecache.py", line 2313, in result + self.future.result() + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/concurrent/futures/_base.py", line 458, in result + return 
self.__get_result() + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/concurrent/futures/_base.py", line 403, in __get_result + raise self._exception +torch._dynamo.exc.BackendCompilerFailed: backend='inductor' raised: +CompilationError: at 14:40: xnumel = 196 + yoffset = tl.program_id(1).to(tl.int64) * YBLOCK + yindex = yoffset + tl.arange(0, YBLOCK)[None, :].to(tl.int64) + ymask = yindex < ynumel + xoffset = tl.program_id(0).to(tl.int64) * XBLOCK + xindex = xoffset + tl.arange(0, XBLOCK)[:, None].to(tl.int64) + xmask = xindex < xnumel + x2 = xindex + y3 = yindex + y0 = yindex % 1024 + y1 = (yindex // 1024) + tmp0 = tl.load(in_ptr0 + (x2 + (196*y3)), xmask, eviction_policy='evict_last').to(tl.float32) + ^ +ValueError('numel (262144) exceeds triton maximum tensor numel (131072)') + +Set TORCH_LOGS="+dynamo" and TORCHDYNAMO_VERBOSE=1 for more information + + +You can suppress this exception and fall back to eager by setting: + import torch._dynamo + torch._dynamo.config.suppress_errors = True + +Run failed with return code: 255 +Output: None +Error: None + loading model: 0it [00:00, ?it/s]detectron2_fasterrcnn_r_101_dc5 + loading model: 0it [00:08, ?it/s] +WARNING:root:detectron2_fasterrcnn_r_101_dc5 failed to load +Original Error: "roi_align_forward_kernel" not implemented for 'BFloat16' +Eager model failed to run +Traceback (most recent call last): + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 1931, in validate_model + self.model_iter_fn(model, example_inputs) + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/torchbench.py", line 540, in forward_pass + return mod(*inputs) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1520, in _call_impl + return forward_call(*args, **kwargs) + File 
"/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/modeling/meta_arch/rcnn.py", line 150, in forward + return self.inference(batched_inputs) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/modeling/meta_arch/rcnn.py", line 213, in inference + results, _ = self.roi_heads(images, features, proposals, None) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1520, in _call_impl + return forward_call(*args, **kwargs) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/modeling/roi_heads/roi_heads.py", line 747, in forward + pred_instances = self._forward_box(features, proposals) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/modeling/roi_heads/roi_heads.py", line 798, in _forward_box + box_features = self.box_pooler(features, [x.proposal_boxes for x in proposals]) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1520, in _call_impl + return forward_call(*args, **kwargs) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/modeling/poolers.py", line 246, in forward + return self.level_poolers[0](x[0], pooler_fmt_boxes) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1520, in _call_impl + return forward_call(*args, **kwargs) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/layers/roi_align.py", line 58, in 
forward + return roi_align( + File "/home/cdhernandez/local/vision/torchvision/ops/roi_align.py", line 238, in roi_align + return torch.ops.torchvision.roi_align( + File "/home/cdhernandez/local/pytorch/torch/_ops.py", line 755, in __call__ + return self._op(*args, **(kwargs or {})) +RuntimeError: "roi_align_forward_kernel" not implemented for 'BFloat16' + +The above exception was the direct cause of the following exception: + +Traceback (most recent call last): + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 3559, in run + ) = runner.load_model( + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/torchbench.py", line 476, in load_model + self.validate_model(model, example_inputs) + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 1934, in validate_model + raise NotImplementedError("Eager model failed to run") from e +NotImplementedError: Eager model failed to run + + loading model: 0it [00:00, ?it/s]detectron2_fasterrcnn_r_101_fpn +WARNING:common:Model detectron2_fasterrcnn_r_101_fpn does not support bfloat16, running with amp instead + loading model: 0it [00:07, ?it/s] +WARNING:common:Model detectron2_fasterrcnn_r_101_fpn does not support bfloat16, running with amp instead +cuda eval detectron2_fasterrcnn_r_101_fpn int8dynamic-bs32 +WARNING:common:Model detectron2_fasterrcnn_r_101_fpn does not support bfloat16, running with amp instead +skipping cudagraphs due to ['mutated inputs'] +AUTOTUNE convolution(32x3x1216x1344, 64x3x7x7) + convolution 3.7106 ms 100.0% + triton_convolution_3 22.1062 ms 16.8% + triton_convolution_4 24.2973 ms 15.3% + triton_convolution_5 26.9378 ms 13.8% + triton_convolution_0 30.0738 ms 12.3% + triton_convolution_2 32.5176 ms 11.4% + triton_convolution_1 81.4506 ms 4.6% +SingleProcess AUTOTUNE takes 5.0160 seconds +AUTOTUNE mm(3268608x64, 64x64) + triton_mm_14 0.5186 ms 100.0% + triton_mm_8 0.5193 ms 99.9% + triton_mm_7 0.5283 ms 
98.2% + triton_mm_10 0.5368 ms 96.6% + triton_mm_6 0.5460 ms 95.0% + triton_mm_13 0.5492 ms 94.4% + triton_mm_9 0.5513 ms 94.1% + mm 0.5668 ms 91.5% + triton_mm_15 0.6987 ms 74.2% + triton_mm_16 0.7410 ms 70.0% +SingleProcess AUTOTUNE takes 4.1927 seconds +AUTOTUNE convolution(32x64x304x336, 64x64x3x3) + convolution 1.4962 ms 100.0% + triton_convolution_18 7.1458 ms 20.9% + triton_convolution_23 8.0892 ms 18.5% + triton_convolution_24 9.7211 ms 15.4% + triton_convolution_19 11.9942 ms 12.5% + triton_convolution_21 12.1926 ms 12.3% + triton_convolution_22 12.5636 ms 11.9% + triton_convolution_20 28.5195 ms 5.2% +SingleProcess AUTOTUNE takes 4.7321 seconds +AUTOTUNE mm(3268608x64, 64x256) + triton_mm_27 1.5259 ms 100.0% + triton_mm_26 1.5455 ms 98.7% + triton_mm_28 1.7398 ms 87.7% + mm 1.7660 ms 86.4% + triton_mm_29 1.7692 ms 86.2% + triton_mm_33 1.8433 ms 82.8% + triton_mm_25 1.8774 ms 81.3% + triton_mm_32 1.9772 ms 77.2% + triton_mm_35 2.4140 ms 63.2% + triton_mm_34 2.8250 ms 54.0% +SingleProcess AUTOTUNE takes 4.9396 seconds +AUTOTUNE mm(3268608x256, 256x64) + triton_mm_51 1.3316 ms 100.0% + triton_mm_53 1.3624 ms 97.7% + triton_mm_56 1.3661 ms 97.5% + mm 1.4040 ms 94.8% + triton_mm_57 1.4285 ms 93.2% + triton_mm_50 1.4361 ms 92.7% + triton_mm_49 1.4596 ms 91.2% + triton_mm_52 1.4744 ms 90.3% + triton_mm_54 2.1015 ms 63.4% + triton_mm_55 2.2606 ms 58.9% +SingleProcess AUTOTUNE takes 4.7929 seconds +AUTOTUNE convolution(32x256x304x336, 128x256x1x1) + convolution 0.4567 ms 100.0% + triton_convolution_114 1.0757 ms 42.5% + triton_convolution_111 1.2215 ms 37.4% + triton_convolution_117 1.3653 ms 33.5% + triton_convolution_116 1.5024 ms 30.4% + triton_convolution_115 1.6693 ms 27.4% + triton_convolution_112 3.1440 ms 14.5% + triton_convolution_113 6.8288 ms 6.7% +SingleProcess AUTOTUNE takes 4.5671 seconds +AUTOTUNE convolution(32x128x152x168, 128x128x3x3) + convolution 1.1805 ms 100.0% + triton_convolution_121 7.0887 ms 16.7% + triton_convolution_118 8.0229 ms 14.7% 
+ triton_convolution_123 8.3772 ms 14.1% + triton_convolution_124 10.6232 ms 11.1% + triton_convolution_122 12.4481 ms 9.5% + triton_convolution_119 13.7375 ms 8.6% + triton_convolution_120 28.6058 ms 4.1% +SingleProcess AUTOTUNE takes 4.7865 seconds +AUTOTUNE mm(817152x128, 128x512) + triton_mm_127 0.9996 ms 100.0% + triton_mm_126 1.0023 ms 99.7% + triton_mm_132 1.0768 ms 92.8% + triton_mm_125 1.1161 ms 89.6% + mm 1.1354 ms 88.0% + triton_mm_128 1.1937 ms 83.7% + triton_mm_129 1.2079 ms 82.8% + triton_mm_133 1.3932 ms 71.8% + triton_mm_135 1.6393 ms 61.0% + triton_mm_134 2.2798 ms 43.8% +SingleProcess AUTOTUNE takes 4.8697 seconds +AUTOTUNE convolution(32x256x304x336, 512x256x1x1) + convolution 1.5260 ms 100.0% + triton_convolution_140 4.2531 ms 35.9% + triton_convolution_142 4.5226 ms 33.7% + triton_convolution_143 5.3815 ms 28.4% + triton_convolution_141 6.6562 ms 22.9% + triton_convolution_137 8.2852 ms 18.4% + triton_convolution_138 12.5198 ms 12.2% + triton_convolution_139 27.1271 ms 5.6% +SingleProcess AUTOTUNE takes 5.0886 seconds +AUTOTUNE mm(817152x512, 512x128) + mm 0.7224 ms 100.0% + triton_mm_146 0.8024 ms 90.0% + triton_mm_145 0.8341 ms 86.6% + triton_mm_148 0.8952 ms 80.7% + triton_mm_147 0.9011 ms 80.2% + triton_mm_151 0.9721 ms 74.3% + triton_mm_152 1.0261 ms 70.4% + triton_mm_144 1.0666 ms 67.7% + triton_mm_154 1.5977 ms 45.2% + triton_mm_149 1.7777 ms 40.6% +SingleProcess AUTOTUNE takes 5.0220 seconds +AUTOTUNE convolution(32x512x152x168, 256x512x1x1) + convolution 0.3106 ms 100.0% + triton_convolution_240 1.0496 ms 29.6% + triton_convolution_242 1.0762 ms 28.9% + triton_convolution_243 1.2981 ms 23.9% + triton_convolution_241 1.7032 ms 18.2% + triton_convolution_237 2.1030 ms 14.8% + triton_convolution_238 3.1476 ms 9.9% + triton_convolution_239 6.7274 ms 4.6% +SingleProcess AUTOTUNE takes 4.7450 seconds +AUTOTUNE convolution(32x256x76x84, 256x256x3x3) + convolution 1.0683 ms 100.0% + triton_convolution_249 6.4305 ms 16.6% + 
triton_convolution_247 7.2643 ms 14.7% + triton_convolution_244 7.9726 ms 13.4% + triton_convolution_250 11.5396 ms 9.3% + triton_convolution_248 16.4705 ms 6.5% + triton_convolution_245 19.3214 ms 5.5% + triton_convolution_246 29.2912 ms 3.6% +SingleProcess AUTOTUNE takes 5.4721 seconds +AUTOTUNE mm(204288x256, 256x1024) + mm 0.6795 ms 100.0% + triton_mm_253 0.7678 ms 88.5% + triton_mm_252 0.7709 ms 88.1% + triton_mm_258 0.7911 ms 85.9% + triton_mm_251 0.9138 ms 74.4% + triton_mm_254 0.9144 ms 74.3% + triton_mm_255 0.9386 ms 72.4% + triton_mm_259 1.0814 ms 62.8% + triton_mm_261 1.3090 ms 51.9% + triton_mm_260 1.8635 ms 36.5% +SingleProcess AUTOTUNE takes 4.9279 seconds +AUTOTUNE convolution(32x512x152x168, 1024x512x1x1) + convolution 1.2123 ms 100.0% + triton_convolution_266 4.1210 ms 29.4% + triton_convolution_268 4.2024 ms 28.8% + triton_convolution_269 5.0803 ms 23.9% + triton_convolution_267 6.5498 ms 18.5% + triton_convolution_263 8.1485 ms 14.9% + triton_convolution_264 12.4510 ms 9.7% + triton_convolution_265 26.7531 ms 4.5% +SingleProcess AUTOTUNE takes 5.0502 seconds +AUTOTUNE mm(204288x1024, 1024x256) + mm 0.5419 ms 100.0% + triton_mm_271 0.6533 ms 82.9% + triton_mm_272 0.6558 ms 82.6% + triton_mm_273 0.7579 ms 71.5% + triton_mm_274 0.7680 ms 70.6% + triton_mm_270 0.8710 ms 62.2% + triton_mm_278 0.8870 ms 61.1% + triton_mm_277 0.9553 ms 56.7% + triton_mm_280 1.3468 ms 40.2% + triton_mm_279 1.5510 ms 34.9% +SingleProcess AUTOTUNE takes 5.6453 seconds +AUTOTUNE convolution(32x1024x76x84, 512x1024x1x1) + convolution 0.2594 ms 100.0% + triton_convolution_955 1.0005 ms 25.9% + triton_convolution_957 1.1321 ms 22.9% + triton_convolution_958 1.2459 ms 20.8% + triton_convolution_956 1.9279 ms 13.5% + triton_convolution_952 2.2604 ms 11.5% + triton_convolution_953 3.1723 ms 8.2% + triton_convolution_954 6.5568 ms 4.0% +SingleProcess AUTOTUNE takes 5.2660 seconds +AUTOTUNE convolution(32x512x38x42, 512x512x3x3) + convolution 1.0310 ms 100.0% + 
triton_convolution_964 6.9642 ms 14.8% + triton_convolution_959 9.8914 ms 10.4% + triton_convolution_962 10.1237 ms 10.2% + triton_convolution_965 17.1603 ms 6.0% + triton_convolution_963 19.1972 ms 5.4% + triton_convolution_960 21.0584 ms 4.9% + triton_convolution_961 28.6043 ms 3.6% +SingleProcess AUTOTUNE takes 5.4783 seconds +AUTOTUNE mm(51072x512, 512x2048) + mm 0.5481 ms 100.0% + triton_mm_968 0.6581 ms 83.3% + triton_mm_967 0.6594 ms 83.1% + triton_mm_973 0.6983 ms 78.5% + triton_mm_970 0.7820 ms 70.1% + triton_mm_969 0.7847 ms 69.9% + triton_mm_966 0.8211 ms 66.8% + triton_mm_974 0.9289 ms 59.0% + triton_mm_976 1.2285 ms 44.6% + triton_mm_972 1.6656 ms 32.9% +SingleProcess AUTOTUNE takes 5.7071 seconds +AUTOTUNE convolution(32x1024x76x84, 2048x1024x1x1) + convolution 1.0465 ms 100.0% + triton_convolution_981 3.9368 ms 26.6% + triton_convolution_983 4.3779 ms 23.9% + triton_convolution_984 4.9079 ms 21.3% + triton_convolution_982 7.6491 ms 13.7% + triton_convolution_978 8.3042 ms 12.6% + triton_convolution_979 12.2804 ms 8.5% + triton_convolution_980 26.2045 ms 4.0% +SingleProcess AUTOTUNE takes 5.1454 seconds +AUTOTUNE mm(51072x2048, 2048x512) + mm 0.4900 ms 100.0% + triton_mm_987 0.6065 ms 80.8% + triton_mm_986 0.6077 ms 80.6% + triton_mm_988 0.7009 ms 69.9% + triton_mm_989 0.7034 ms 69.7% + triton_mm_985 0.8206 ms 59.7% + triton_mm_993 0.8374 ms 58.5% + triton_mm_992 0.8790 ms 55.7% + triton_mm_995 1.2860 ms 38.1% + triton_mm_991 1.5045 ms 32.6% +SingleProcess AUTOTUNE takes 5.4583 seconds +AUTOTUNE addmm(51072x256, 51072x2048, 2048x256) + bias_addmm 0.3029 ms 100.0% + addmm 0.3089 ms 98.1% + triton_mm_1049 0.3235 ms 93.6% + triton_mm_1048 0.3275 ms 92.5% + triton_mm_1050 0.3550 ms 85.3% + triton_mm_1051 0.3622 ms 83.6% + triton_mm_1055 0.4242 ms 71.4% + triton_mm_1047 0.4364 ms 69.4% + triton_mm_1054 0.5085 ms 59.6% + triton_mm_1057 0.7001 ms 43.3% +SingleProcess AUTOTUNE takes 5.7065 seconds +AUTOTUNE convolution(32x256x38x42, 256x256x3x3) + convolution 
0.2750 ms 100.0% + triton_convolution_1064 1.7089 ms 16.1% + triton_convolution_1062 1.8415 ms 14.9% + triton_convolution_1059 2.1425 ms 12.8% + triton_convolution_1065 2.9491 ms 9.3% + triton_convolution_1063 3.9602 ms 6.9% + triton_convolution_1060 4.6976 ms 5.9% + triton_convolution_1061 7.4853 ms 3.7% +SingleProcess AUTOTUNE takes 5.0653 seconds +AUTOTUNE addmm(3268608x256, 3268608x256, 256x256) + triton_mm_1067 3.5880 ms 100.0% + triton_mm_1068 3.6040 ms 99.6% + triton_mm_1073 3.8683 ms 92.8% + triton_mm_1066 4.2427 ms 84.6% + triton_mm_1070 4.2668 ms 84.1% + triton_mm_1069 4.2703 ms 84.0% + triton_mm_1074 4.8278 ms 74.3% + bias_addmm 5.9413 ms 60.4% + addmm 5.9656 ms 60.1% + triton_mm_1076 6.2184 ms 57.7% +SingleProcess AUTOTUNE takes 5.7881 seconds +AUTOTUNE addmm(817152x256, 817152x512, 512x256) + bias_addmm 1.3238 ms 100.0% + triton_mm_1080 1.5291 ms 86.6% + triton_mm_1079 1.5313 ms 86.4% + triton_mm_1081 1.7488 ms 75.7% + triton_mm_1082 1.7947 ms 73.8% + addmm 1.8729 ms 70.7% + triton_mm_1078 1.9184 ms 69.0% + triton_mm_1086 2.0476 ms 64.6% + triton_mm_1085 2.6087 ms 50.7% + triton_mm_1088 2.8319 ms 46.7% +SingleProcess AUTOTUNE takes 6.2909 seconds +AUTOTUNE addmm(204288x256, 204288x1024, 1024x256) + bias_addmm 0.5711 ms 100.0% + triton_mm_1091 0.6686 ms 85.4% + triton_mm_1092 0.6850 ms 83.4% + addmm 0.7367 ms 77.5% + triton_mm_1093 0.7751 ms 73.7% + triton_mm_1094 0.7860 ms 72.7% + triton_mm_1090 0.8937 ms 63.9% + triton_mm_1098 0.9115 ms 62.7% + triton_mm_1097 0.9857 ms 57.9% + triton_mm_1100 1.3505 ms 42.3% +SingleProcess AUTOTUNE takes 5.8394 seconds +AUTOTUNE convolution(32x256x304x336, 256x256x3x3) + convolution 17.6540 ms 100.0% + triton_convolution_1107 100.5644 ms 17.6% + triton_convolution_1105 118.3672 ms 14.9% + triton_convolution_1102 122.4366 ms 14.4% + triton_convolution_1108 186.0839 ms 9.5% + triton_convolution_1106 265.2757 ms 6.7% + triton_convolution_1103 300.7162 ms 5.9% + triton_convolution_1104 466.2684 ms 3.8% +SingleProcess 
AUTOTUNE takes 17.1328 seconds +AUTOTUNE convolution(32x256x152x168, 256x256x3x3) + convolution 4.3941 ms 100.0% + triton_convolution_1114 25.2578 ms 17.4% + triton_convolution_1112 29.7074 ms 14.8% + triton_convolution_1109 31.0365 ms 14.2% + triton_convolution_1115 46.3984 ms 9.5% + triton_convolution_1113 66.1080 ms 6.6% + triton_convolution_1110 76.1160 ms 5.8% + triton_convolution_1111 116.5012 ms 3.8% +SingleProcess AUTOTUNE takes 7.3893 seconds +AUTOTUNE addmm(3268608x3, 3268608x256, 256x3) + triton_mm_1132 1.0489 ms 100.0% + triton_mm_1131 1.0684 ms 98.2% + triton_mm_1135 1.0694 ms 98.1% + triton_mm_1133 1.0721 ms 97.8% + triton_mm_1134 1.0732 ms 97.7% + triton_mm_1137 1.0832 ms 96.8% + triton_mm_1130 1.0861 ms 96.6% + triton_mm_1141 1.1324 ms 92.6% + triton_mm_1140 1.1654 ms 90.0% + triton_mm_1138 1.6334 ms 64.2% +SingleProcess AUTOTUNE takes 4.4295 seconds +AUTOTUNE addmm(817152x3, 817152x256, 256x3) + triton_mm_1151 0.2790 ms 100.0% + triton_mm_1150 0.2827 ms 98.7% + triton_mm_1154 0.2834 ms 98.4% + triton_mm_1152 0.2838 ms 98.3% + triton_mm_1153 0.2846 ms 98.0% + triton_mm_1149 0.2883 ms 96.8% + triton_mm_1156 0.2890 ms 96.5% + triton_mm_1160 0.3026 ms 92.2% + triton_mm_1159 0.3090 ms 90.3% + bias_addmm 0.4003 ms 69.7% +SingleProcess AUTOTUNE takes 4.7481 seconds +AUTOTUNE addmm(204288x3, 204288x256, 256x3) + triton_mm_1169 0.0863 ms 100.0% + triton_mm_1171 0.0867 ms 99.5% + triton_mm_1173 0.0870 ms 99.2% + triton_mm_1172 0.0878 ms 98.3% + triton_mm_1170 0.0879 ms 98.2% + triton_mm_1168 0.0897 ms 96.2% + triton_mm_1175 0.0902 ms 95.7% + triton_mm_1178 0.0952 ms 90.6% + triton_mm_1179 0.1020 ms 84.6% + triton_mm_1176 0.1132 ms 76.3% +SingleProcess AUTOTUNE takes 4.5108 seconds +AUTOTUNE addmm(51072x3, 51072x256, 256x3) + triton_mm_1189 0.0340 ms 100.0% + triton_mm_1191 0.0343 ms 99.0% + triton_mm_1190 0.0344 ms 98.7% + triton_mm_1188 0.0347 ms 97.9% + triton_mm_1187 0.0351 ms 96.9% + triton_mm_1194 0.0351 ms 96.8% + triton_mm_1192 0.0352 ms 96.6% + 
triton_mm_1197 0.0364 ms 93.4% + triton_mm_1198 0.0364 ms 93.3% + bias_addmm 0.0380 ms 89.4% +SingleProcess AUTOTUNE takes 4.2754 seconds +AUTOTUNE convolution(32x256x19x21, 256x256x3x3) + convolution 0.0683 ms 100.0% + triton_convolution_1202 0.4277 ms 16.0% + triton_convolution_1204 0.4415 ms 15.5% + triton_convolution_1199 0.5705 ms 12.0% + triton_convolution_1205 0.6938 ms 9.8% + triton_convolution_1200 0.7713 ms 8.9% + triton_convolution_1203 0.7883 ms 8.7% + triton_convolution_1201 1.8232 ms 3.7% +SingleProcess AUTOTUNE takes 4.8825 seconds +AUTOTUNE addmm(12768x3, 12768x256, 256x3) + triton_mm_1209 0.0132 ms 100.0% + triton_mm_1207 0.0133 ms 99.8% + triton_mm_1211 0.0133 ms 99.5% + triton_mm_1208 0.0135 ms 97.9% + triton_mm_1210 0.0141 ms 94.1% + triton_mm_1214 0.0149 ms 89.0% + triton_mm_1206 0.0153 ms 86.6% + bias_addmm 0.0161 ms 82.5% + triton_mm_1213 0.0161 ms 82.3% + triton_mm_1216 0.0173 ms 76.7% +SingleProcess AUTOTUNE takes 4.5147 seconds +AUTOTUNE addmm(3268608x12, 3268608x256, 256x12) + triton_mm_1226 1.0755 ms 100.0% + triton_mm_1220 1.1086 ms 97.0% + triton_mm_1219 1.1316 ms 95.0% + triton_mm_1221 1.1340 ms 94.8% + triton_mm_1223 1.1346 ms 94.8% + triton_mm_1222 1.1365 ms 94.6% + triton_mm_1225 1.1634 ms 92.4% + triton_mm_1218 1.1695 ms 92.0% + triton_mm_1227 1.2061 ms 89.2% + triton_mm_1229 1.2184 ms 88.3% +SingleProcess AUTOTUNE takes 4.3601 seconds +AUTOTUNE addmm(817152x12, 817152x256, 256x12) + triton_mm_1238 0.2854 ms 100.0% + triton_mm_1232 0.2928 ms 97.5% + triton_mm_1231 0.2960 ms 96.4% + triton_mm_1233 0.2968 ms 96.2% + triton_mm_1235 0.2970 ms 96.1% + triton_mm_1234 0.2985 ms 95.6% + triton_mm_1237 0.3063 ms 93.2% + triton_mm_1230 0.3072 ms 92.9% + triton_mm_1239 0.3164 ms 90.2% + triton_mm_1236 0.3207 ms 89.0% +SingleProcess AUTOTUNE takes 4.1640 seconds +AUTOTUNE addmm(204288x12, 204288x256, 256x12) + triton_mm_1250 0.0884 ms 100.0% + triton_mm_1243 0.0885 ms 99.8% + triton_mm_1247 0.0889 ms 99.4% + triton_mm_1245 0.0892 ms 99.1% + 
triton_mm_1244 0.0893 ms 98.9% + triton_mm_1246 0.0901 ms 98.1% + triton_mm_1242 0.0923 ms 95.8% + triton_mm_1249 0.0923 ms 95.8% + triton_mm_1248 0.0940 ms 94.0% + triton_mm_1251 0.0940 ms 94.0% +SingleProcess AUTOTUNE takes 4.0702 seconds +AUTOTUNE addmm(51072x12, 51072x256, 256x12) + triton_mm_1256 0.0333 ms 100.0% + triton_mm_1262 0.0336 ms 99.0% + triton_mm_1263 0.0343 ms 97.2% + triton_mm_1255 0.0343 ms 97.1% + triton_mm_1257 0.0344 ms 96.8% + triton_mm_1258 0.0344 ms 96.7% + triton_mm_1254 0.0350 ms 95.1% + triton_mm_1261 0.0350 ms 95.1% + triton_mm_1259 0.0352 ms 94.5% + triton_mm_1260 0.0361 ms 92.2% +SingleProcess AUTOTUNE takes 4.3500 seconds +AUTOTUNE addmm(12768x12, 12768x256, 256x12) + triton_mm_1267 0.0132 ms 100.0% + triton_mm_1275 0.0132 ms 100.0% + triton_mm_1271 0.0134 ms 98.6% + triton_mm_1270 0.0135 ms 97.9% + triton_mm_1274 0.0136 ms 97.4% + triton_mm_1272 0.0137 ms 96.7% + triton_mm_1269 0.0138 ms 95.6% + triton_mm_1268 0.0139 ms 95.2% + bias_addmm 0.0147 ms 90.2% + triton_mm_1266 0.0152 ms 87.1% +SingleProcess AUTOTUNE takes 4.2287 seconds +skipping cudagraphs due to ['non-cuda device in graph'] +[2023-12-12 11:30:21,351] [30/0_1] torch._inductor.utils: [WARNING] DeviceCopy in input program +skipping cudagraphs due to ['non-cuda device in graph'] +AUTOTUNE int_mm(32000x12544, 12544x1024, 32000x1024) + triton_mm_1288 1.6268 ms 100.0% + triton_mm_1287 1.6641 ms 97.8% + triton_mm_1286 3.2028 ms 50.8% + triton_mm_1279 3.3561 ms 48.5% + triton_mm_1280 3.3804 ms 48.1% + triton_mm_1281 3.5864 ms 45.4% + triton_mm_1282 3.6539 ms 44.5% + triton_mm_1278 4.6130 ms 35.3% + triton_mm_1285 6.7362 ms 24.2% + triton_mm_1284 10.3430 ms 15.7% +SingleProcess AUTOTUNE takes 7.9031 seconds +AUTOTUNE int_mm(32000x1024, 1024x1024, 32000x1024) + triton_mm_1299 0.3092 ms 100.0% + triton_mm_1298 0.3122 ms 99.0% + triton_mm_1291 0.3562 ms 86.8% + triton_mm_1290 0.3630 ms 85.2% + triton_mm_1297 0.3685 ms 83.9% + triton_mm_1292 0.4132 ms 74.8% + triton_mm_1293 0.4229 ms 
73.1% + triton_mm_1289 0.4459 ms 69.3% + triton_mm_1296 0.6285 ms 49.2% + triton_mm_1295 0.9780 ms 31.6% +SingleProcess AUTOTUNE takes 7.8710 seconds +AUTOTUNE int_mm(32000x1024, 1024x81, 32000x81) + triton_mm_1309 0.0622 ms 100.0% + triton_mm_1308 0.0670 ms 92.8% + triton_mm_1304 0.0760 ms 81.8% + triton_mm_1302 0.0789 ms 78.8% + triton_mm_1310 0.0896 ms 69.4% + triton_mm_1301 0.0958 ms 64.9% + triton_mm_1303 0.0958 ms 64.9% + triton_mm_1305 0.1015 ms 61.2% + triton_mm_1300 0.1186 ms 52.4% + triton_mm_1307 0.1248 ms 49.8% +SingleProcess AUTOTUNE takes 8.0148 seconds +AUTOTUNE int_mm(32000x1024, 1024x320, 32000x320) + triton_mm_1319 0.1247 ms 100.0% + triton_mm_1321 0.1270 ms 98.2% + triton_mm_1313 0.1348 ms 92.5% + triton_mm_1315 0.1486 ms 83.9% + triton_mm_1320 0.1552 ms 80.4% + triton_mm_1312 0.1607 ms 77.6% + triton_mm_1311 0.1653 ms 75.4% + triton_mm_1314 0.1705 ms 73.1% + triton_mm_1318 0.2664 ms 46.8% + triton_mm_1317 0.3146 ms 39.6% +SingleProcess AUTOTUNE takes 7.6002 seconds +skipping cudagraphs due to ['mutated inputs'] +skipping cudagraphs due to ['mutated inputs'] +skipping cudagraphs due to ['mutated inputs'] +skipping cudagraphs due to ['mutated inputs'] +skipping cudagraphs due to ['mutated inputs'] +skipping cudagraphs due to ['mutated inputs'] +skipping cudagraphs due to ['mutated inputs'] +skipping cudagraphs due to ['mutated inputs'] +skipping cudagraphs due to ['mutated inputs'] +skipping cudagraphs due to ['mutated inputs'] +skipping cudagraphs due to ['mutated inputs'] +skipping cudagraphs due to ['mutated inputs'] +skipping cudagraphs due to ['mutated inputs'] +skipping cudagraphs due to ['mutated inputs'] +skipping cudagraphs due to ['mutated inputs'] +skipping cudagraphs due to ['mutated inputs'] +skipping cudagraphs due to ['mutated inputs'] +skipping cudagraphs due to ['mutated inputs'] +skipping cudagraphs due to ['mutated inputs'] +skipping cudagraphs due to ['mutated inputs'] +skipping cudagraphs due to ['mutated inputs'] +skipping 
cudagraphs due to ['mutated inputs'] +skipping cudagraphs due to ['mutated inputs'] +skipping cudagraphs due to ['mutated inputs'] +skipping cudagraphs due to ['mutated inputs'] +skipping cudagraphs due to ['mutated inputs'] +skipping cudagraphs due to ['mutated inputs'] +skipping cudagraphs due to ['mutated inputs'] +skipping cudagraphs due to ['mutated inputs'] +skipping cudagraphs due to ['mutated inputs'] +skipping cudagraphs due to ['mutated inputs'] +skipping cudagraphs due to ['mutated inputs'] + running benchmark: 0%| | 0/30 [00:00 + async_compile.wait(globals()) + File "/home/cdhernandez/local/pytorch/torch/_inductor/codecache.py", line 2470, in wait + scope[key] = result.result() + File "/home/cdhernandez/local/pytorch/torch/_inductor/codecache.py", line 2313, in result + self.future.result() + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/concurrent/futures/_base.py", line 458, in result + return self.__get_result() + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/concurrent/futures/_base.py", line 403, in __get_result + raise self._exception +torch._dynamo.exc.BackendCompilerFailed: backend='inductor' raised: +CompilationError: at 14:40: xnumel = 196 + yoffset = tl.program_id(1).to(tl.int64) * YBLOCK + yindex = yoffset + tl.arange(0, YBLOCK)[None, :].to(tl.int64) + ymask = yindex < ynumel + xoffset = tl.program_id(0).to(tl.int64) * XBLOCK + xindex = xoffset + tl.arange(0, XBLOCK)[:, None].to(tl.int64) + xmask = xindex < xnumel + x2 = xindex + y3 = yindex + y0 = yindex % 1024 + y1 = (yindex // 1024) + tmp0 = tl.load(in_ptr0 + (x2 + (196*y3)), xmask, eviction_policy='evict_last').to(tl.float32) + ^ +ValueError('numel (262144) exceeds triton maximum tensor numel (131072)') + +Set TORCH_LOGS="+dynamo" and TORCHDYNAMO_VERBOSE=1 for more information + + +You can suppress this exception and fall back to eager by setting: + import torch._dynamo + torch._dynamo.config.suppress_errors = True + +Run failed with return 
code: 255 +Output: None +Error: None + loading model: 0it [00:00, ?it/s]detectron2_maskrcnn_r_101_fpn +WARNING:common:Model detectron2_maskrcnn_r_101_fpn does not support bfloat16, running with amp instead + loading model: 0it [00:08, ?it/s] +WARNING:common:Model detectron2_maskrcnn_r_101_fpn does not support bfloat16, running with amp instead +cuda eval detectron2_maskrcnn_r_101_fpn int8dynamic-bs32 +WARNING:common:Model detectron2_maskrcnn_r_101_fpn does not support bfloat16, running with amp instead +skipping cudagraphs due to ['mutated inputs'] +skipping cudagraphs due to ['non-cuda device in graph'] +[2023-12-12 12:04:47,676] [30/0_1] torch._inductor.utils: [WARNING] DeviceCopy in input program +skipping cudagraphs due to ['non-cuda device in graph'] +skipping cudagraphs due to ['non-cuda device in graph'] +[2023-12-12 12:06:04,086] [30/1_1] torch._inductor.utils: [WARNING] DeviceCopy in input program +skipping cudagraphs due to ['non-cuda device in graph'] +AUTOTUNE convolution(967x256x14x14, 256x256x3x3) + convolution 0.9932 ms 100.0% + triton_convolution_1327 5.8403 ms 17.0% + triton_convolution_1325 6.4791 ms 15.3% + triton_convolution_1322 7.3768 ms 13.5% + triton_convolution_1328 10.8547 ms 9.1% + triton_convolution_1326 14.0171 ms 7.1% + triton_convolution_1323 18.6382 ms 5.3% + triton_convolution_1324 25.5688 ms 3.9% +SingleProcess AUTOTUNE takes 5.9567 seconds +AUTOTUNE addmm(758128x80, 758128x256, 256x80) + bias_addmm 0.3601 ms 100.0% + triton_mm_1352 0.4278 ms 84.2% + triton_mm_1357 0.4576 ms 78.7% + triton_mm_1351 0.4592 ms 78.4% + triton_mm_1354 0.4899 ms 73.5% + triton_mm_1353 0.4958 ms 72.6% + triton_mm_1350 0.5550 ms 64.9% + triton_mm_1358 0.5671 ms 63.5% + addmm 0.6006 ms 60.0% + triton_mm_1355 0.7203 ms 50.0% +SingleProcess AUTOTUNE takes 5.8245 seconds +skipping cudagraphs due to ['mutated inputs'] +skipping cudagraphs due to ['mutated inputs'] +skipping cudagraphs due to ['mutated inputs'] +skipping cudagraphs due to ['mutated inputs'] 
+skipping cudagraphs due to ['mutated inputs'] +skipping cudagraphs due to ['mutated inputs'] +skipping cudagraphs due to ['mutated inputs'] +skipping cudagraphs due to ['mutated inputs'] +skipping cudagraphs due to ['mutated inputs'] +skipping cudagraphs due to ['mutated inputs'] +skipping cudagraphs due to ['mutated inputs'] +skipping cudagraphs due to ['mutated inputs'] +skipping cudagraphs due to ['mutated inputs'] +skipping cudagraphs due to ['mutated inputs'] +skipping cudagraphs due to ['mutated inputs'] +skipping cudagraphs due to ['mutated inputs'] +skipping cudagraphs due to ['mutated inputs'] +skipping cudagraphs due to ['mutated inputs'] +skipping cudagraphs due to ['mutated inputs'] +skipping cudagraphs due to ['mutated inputs'] +skipping cudagraphs due to ['mutated inputs'] +skipping cudagraphs due to ['mutated inputs'] +skipping cudagraphs due to ['mutated inputs'] +skipping cudagraphs due to ['mutated inputs'] +skipping cudagraphs due to ['mutated inputs'] +skipping cudagraphs due to ['mutated inputs'] +skipping cudagraphs due to ['mutated inputs'] +skipping cudagraphs due to ['mutated inputs'] +skipping cudagraphs due to ['mutated inputs'] +skipping cudagraphs due to ['mutated inputs'] + running benchmark: 0%| | 0/30 [00:00 + async_compile.wait(globals()) + File "/home/cdhernandez/local/pytorch/torch/_inductor/codecache.py", line 2470, in wait + scope[key] = result.result() + File "/home/cdhernandez/local/pytorch/torch/_inductor/codecache.py", line 2313, in result + self.future.result() + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/concurrent/futures/_base.py", line 458, in result + return self.__get_result() + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/concurrent/futures/_base.py", line 403, in __get_result + raise self._exception +torch._dynamo.exc.BackendCompilerFailed: backend='inductor' raised: +CompilationError: at 14:40: xnumel = 196 + yoffset = tl.program_id(1).to(tl.int64) * YBLOCK + 
yindex = yoffset + tl.arange(0, YBLOCK)[None, :].to(tl.int64) + ymask = yindex < ynumel + xoffset = tl.program_id(0).to(tl.int64) * XBLOCK + xindex = xoffset + tl.arange(0, XBLOCK)[:, None].to(tl.int64) + xmask = xindex < xnumel + x2 = xindex + y3 = yindex + y0 = yindex % 1024 + y1 = (yindex // 1024) + tmp0 = tl.load(in_ptr0 + (x2 + (196*y3)), xmask, eviction_policy='evict_last').to(tl.float32) + ^ +ValueError('numel (262144) exceeds triton maximum tensor numel (131072)') + +Set TORCH_LOGS="+dynamo" and TORCHDYNAMO_VERBOSE=1 for more information + + +You can suppress this exception and fall back to eager by setting: + import torch._dynamo + torch._dynamo.config.suppress_errors = True + +Run failed with return code: 255 +Output: None +Error: None + loading model: 0it [00:00, ?it/s]detectron2_maskrcnn_r_50_fpn +WARNING:common:Model detectron2_maskrcnn_r_50_fpn does not support bfloat16, running with amp instead + loading model: 0it [00:06, ?it/s] +WARNING:common:Model detectron2_maskrcnn_r_50_fpn does not support bfloat16, running with amp instead +cuda eval detectron2_maskrcnn_r_50_fpn int8dynamic-bs32 +WARNING:common:Model detectron2_maskrcnn_r_50_fpn does not support bfloat16, running with amp instead +skipping cudagraphs due to ['mutated inputs'] +skipping cudagraphs due to ['non-cuda device in graph'] +[2023-12-12 12:17:51,708] [30/0_1] torch._inductor.utils: [WARNING] DeviceCopy in input program +skipping cudagraphs due to ['non-cuda device in graph'] +skipping cudagraphs due to ['non-cuda device in graph'] +[2023-12-12 12:18:56,679] [30/1_1] torch._inductor.utils: [WARNING] DeviceCopy in input program +skipping cudagraphs due to ['non-cuda device in graph'] +AUTOTUNE convolution(1154x256x14x14, 256x256x3x3) + convolution 1.1834 ms 100.0% + triton_convolution_800 6.9411 ms 17.0% + triton_convolution_798 7.7173 ms 15.3% + triton_convolution_795 8.7726 ms 13.5% + triton_convolution_801 12.9288 ms 9.2% + triton_convolution_799 16.7297 ms 7.1% + 
triton_convolution_796 22.5466 ms 5.2% + triton_convolution_797 30.0653 ms 3.9% +SingleProcess AUTOTUNE takes 5.5937 seconds +AUTOTUNE addmm(904736x80, 904736x256, 256x80) + bias_addmm 0.4253 ms 100.0% + triton_mm_830 0.5515 ms 77.1% + triton_mm_825 0.5585 ms 76.2% + triton_mm_827 0.5972 ms 71.2% + triton_mm_824 0.7048 ms 60.3% + triton_mm_826 0.7101 ms 59.9% + addmm 0.7138 ms 59.6% + triton_mm_831 0.7736 ms 55.0% + triton_mm_823 0.8249 ms 51.6% + triton_mm_828 0.8686 ms 49.0% +SingleProcess AUTOTUNE takes 5.7222 seconds +skipping cudagraphs due to ['mutated inputs'] +skipping cudagraphs due to ['mutated inputs'] +skipping cudagraphs due to ['mutated inputs'] +skipping cudagraphs due to ['mutated inputs'] +skipping cudagraphs due to ['mutated inputs'] +skipping cudagraphs due to ['mutated inputs'] +skipping cudagraphs due to ['mutated inputs'] +skipping cudagraphs due to ['mutated inputs'] +skipping cudagraphs due to ['mutated inputs'] +skipping cudagraphs due to ['mutated inputs'] +skipping cudagraphs due to ['mutated inputs'] +skipping cudagraphs due to ['mutated inputs'] +skipping cudagraphs due to ['mutated inputs'] +skipping cudagraphs due to ['mutated inputs'] +skipping cudagraphs due to ['mutated inputs'] +skipping cudagraphs due to ['mutated inputs'] +skipping cudagraphs due to ['mutated inputs'] +skipping cudagraphs due to ['mutated inputs'] +skipping cudagraphs due to ['mutated inputs'] +skipping cudagraphs due to ['mutated inputs'] +skipping cudagraphs due to ['mutated inputs'] +skipping cudagraphs due to ['mutated inputs'] +skipping cudagraphs due to ['mutated inputs'] +skipping cudagraphs due to ['mutated inputs'] +skipping cudagraphs due to ['mutated inputs'] +skipping cudagraphs due to ['mutated inputs'] +skipping cudagraphs due to ['mutated inputs'] +skipping cudagraphs due to ['mutated inputs'] +skipping cudagraphs due to ['mutated inputs'] +skipping cudagraphs due to ['mutated inputs'] +skipping cudagraphs due to ['mutated inputs'] + running 
benchmark: 0%| | 0/30 [00:00bhts", q, k * softmax_scale) + File "/home/cdhernandez/local/pytorch/torch/functional.py", line 380, in einsum + return _VF.einsum(equation, operands) # type: ignore[attr-defined] +torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 1024.00 MiB. GPU 0 has a total capacity of 79.15 GiB of which 589.69 MiB is free. Including non-PyTorch memory, this process has 78.57 GiB memory in use. Of the allocated memory 77.73 GiB is allocated by PyTorch, and 336.30 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +The above exception was the direct cause of the following exception: + +Traceback (most recent call last): + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 3559, in run + ) = runner.load_model( + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/torchbench.py", line 476, in load_model + self.validate_model(model, example_inputs) + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 1934, in validate_model + raise NotImplementedError("Eager model failed to run") from e +NotImplementedError: Eager model failed to run + + loading model: 0it [00:00, ?it/s] loading model: 0it [00:02, ?it/s] +phlippe_densenet +cuda eval phlippe_densenet int8dynamic-bs32 +AUTOTUNE convolution(32x3x32x32, 32x3x3x3) + triton_convolution_0 0.0162 ms 100.0% + triton_convolution_4 0.0166 ms 97.9% + convolution 0.0184 ms 88.2% + triton_convolution_3 0.0198 ms 81.8% + triton_convolution_2 0.0214 ms 75.8% + triton_convolution_5 0.0251 ms 64.7% + triton_convolution_1 0.0260 ms 62.4% +SingleProcess AUTOTUNE takes 2.9950 seconds +AUTOTUNE mm(32768x32, 32x32) + triton_mm_9 0.0096 ms 100.0% + triton_mm_11 0.0096 ms 100.0% + 
triton_mm_14 0.0096 ms 100.0% + triton_mm_6 0.0098 ms 98.0% + triton_mm_7 0.0098 ms 98.0% + triton_mm_17 0.0098 ms 98.0% + triton_mm_8 0.0099 ms 97.1% + triton_mm_16 0.0100 ms 96.1% + triton_mm_12 0.0100 ms 95.5% + triton_mm_15 0.0100 ms 95.5% +SingleProcess AUTOTUNE takes 3.3572 seconds +AUTOTUNE convolution(32x32x32x32, 16x32x3x3) + convolution 0.0164 ms 100.0% + triton_convolution_21 0.0302 ms 54.4% + triton_convolution_22 0.0314 ms 52.4% + triton_convolution_18 0.0315 ms 52.2% + triton_convolution_23 0.0425 ms 38.7% + triton_convolution_19 0.0427 ms 38.6% + triton_convolution_20 0.0780 ms 21.1% +SingleProcess AUTOTUNE takes 3.1045 seconds +AUTOTUNE mm(32768x48, 48x32) + triton_mm_34 0.0109 ms 100.0% + triton_mm_35 0.0113 ms 96.0% + triton_mm_26 0.0116 ms 94.2% + triton_mm_33 0.0116 ms 93.4% + triton_mm_31 0.0117 ms 92.9% + triton_mm_24 0.0117 ms 92.6% + triton_mm_32 0.0118 ms 91.9% + triton_mm_27 0.0121 ms 89.7% + triton_mm_28 0.0122 ms 89.1% + triton_mm_25 0.0126 ms 86.5% +SingleProcess AUTOTUNE takes 4.0981 seconds +AUTOTUNE mm(32768x128, 128x64) + triton_mm_114 0.0159 ms 100.0% + triton_mm_115 0.0166 ms 96.0% + triton_mm_121 0.0168 ms 94.7% + triton_mm_116 0.0170 ms 94.0% + triton_mm_117 0.0175 ms 90.9% + triton_mm_122 0.0181 ms 88.0% + triton_mm_118 0.0191 ms 83.4% + triton_mm_124 0.0204 ms 78.1% + mm 0.0205 ms 77.8% + triton_mm_120 0.0205 ms 77.6% +SingleProcess AUTOTUNE takes 4.1403 seconds +AUTOTUNE mm(8192x64, 64x32) + triton_mm_128 0.0079 ms 100.0% + triton_mm_126 0.0084 ms 94.6% + triton_mm_133 0.0084 ms 94.6% + triton_mm_132 0.0084 ms 94.3% + triton_mm_129 0.0085 ms 92.9% + triton_mm_135 0.0086 ms 92.2% + triton_mm_127 0.0086 ms 91.5% + triton_mm_130 0.0087 ms 90.8% + triton_mm_134 0.0088 ms 90.1% + triton_mm_137 0.0088 ms 89.5% +SingleProcess AUTOTUNE takes 3.7232 seconds +AUTOTUNE convolution(32x32x16x16, 16x32x3x3) + convolution 0.0109 ms 100.0% + triton_convolution_141 0.0175 ms 62.3% + triton_convolution_142 0.0184 ms 59.4% + 
triton_convolution_138 0.0193 ms 56.5% + triton_convolution_143 0.0242 ms 45.2% + triton_convolution_139 0.0316 ms 34.5% + triton_convolution_140 0.0779 ms 14.0% +SingleProcess AUTOTUNE takes 2.6783 seconds +AUTOTUNE mm(8192x160, 160x80) + triton_mm_235 0.0125 ms 100.0% + triton_mm_236 0.0125 ms 99.7% + triton_mm_234 0.0129 ms 97.0% + triton_mm_237 0.0130 ms 96.3% + triton_mm_238 0.0130 ms 96.3% + mm 0.0132 ms 94.7% + triton_mm_239 0.0137 ms 91.1% + triton_mm_242 0.0140 ms 89.5% + triton_mm_245 0.0142 ms 88.1% + triton_mm_241 0.0146 ms 85.7% +SingleProcess AUTOTUNE takes 5.1406 seconds +AUTOTUNE mm(2048x80, 80x32) + triton_mm_252 0.0071 ms 100.0% + triton_mm_249 0.0072 ms 98.2% + triton_mm_251 0.0072 ms 98.2% + triton_mm_254 0.0074 ms 96.1% + triton_mm_247 0.0076 ms 92.9% + triton_mm_250 0.0076 ms 92.5% + triton_mm_248 0.0079 ms 89.8% + triton_mm_255 0.0079 ms 89.8% + triton_mm_246 0.0084 ms 84.7% + mm 0.0084 ms 83.7% +SingleProcess AUTOTUNE takes 4.4931 seconds +AUTOTUNE convolution(32x32x8x8, 16x32x3x3) + convolution 0.0106 ms 100.0% + triton_convolution_262 0.0164 ms 64.6% + triton_convolution_258 0.0171 ms 62.2% + triton_convolution_261 0.0179 ms 59.3% + triton_convolution_263 0.0257 ms 41.4% + triton_convolution_259 0.0326 ms 32.6% + triton_convolution_260 0.0773 ms 13.7% +SingleProcess AUTOTUNE takes 2.7801 seconds +AUTOTUNE mm(2048x96, 96x32) + triton_mm_270 0.0070 ms 100.0% + triton_mm_267 0.0072 ms 97.8% + triton_mm_266 0.0074 ms 94.4% + triton_mm_265 0.0076 ms 91.6% + triton_mm_268 0.0076 ms 91.6% + triton_mm_269 0.0078 ms 90.3% + triton_mm_272 0.0079 ms 89.0% + triton_mm_273 0.0079 ms 89.0% + triton_mm_264 0.0084 ms 83.9% + mm 0.0084 ms 83.0% +SingleProcess AUTOTUNE takes 3.7179 seconds +AUTOTUNE mm(2048x112, 112x32) + triton_mm_288 0.0074 ms 100.0% + triton_mm_285 0.0076 ms 97.1% + triton_mm_287 0.0076 ms 97.1% + triton_mm_283 0.0079 ms 94.5% + triton_mm_290 0.0079 ms 94.3% + triton_mm_291 0.0079 ms 94.3% + triton_mm_286 0.0081 ms 91.7% + triton_mm_284 
0.0083 ms 89.2% + mm 0.0084 ms 87.9% + triton_mm_282 0.0095 ms 78.4% +SingleProcess AUTOTUNE takes 3.7757 seconds +AUTOTUNE mm(2048x128, 128x32) + triton_mm_308 0.0074 ms 100.0% + triton_mm_305 0.0076 ms 96.7% + triton_mm_306 0.0078 ms 94.7% + triton_mm_303 0.0080 ms 92.0% + triton_mm_302 0.0081 ms 91.3% + triton_mm_304 0.0081 ms 91.3% + triton_mm_309 0.0081 ms 91.3% + mm 0.0081 ms 90.9% + triton_mm_301 0.0083 ms 88.8% + triton_mm_300 0.0092 ms 79.9% +SingleProcess AUTOTUNE takes 3.7442 seconds +AUTOTUNE mm(2048x144, 144x32) + triton_mm_324 0.0080 ms 100.0% + triton_mm_326 0.0083 ms 96.2% + triton_mm_327 0.0084 ms 95.8% + triton_mm_321 0.0084 ms 95.4% + triton_mm_323 0.0084 ms 94.7% + triton_mm_322 0.0085 ms 94.0% + triton_mm_320 0.0085 ms 93.6% + triton_mm_319 0.0086 ms 93.3% + mm 0.0092 ms 86.5% + triton_mm_318 0.0099 ms 80.9% +SingleProcess AUTOTUNE takes 3.8789 seconds +AUTOTUNE mm(2048x160, 160x32) + triton_mm_342 0.0076 ms 100.0% + triton_mm_341 0.0079 ms 95.9% + triton_mm_344 0.0079 ms 95.9% + triton_mm_345 0.0084 ms 90.4% + triton_mm_338 0.0085 ms 88.4% + triton_mm_337 0.0086 ms 88.1% + triton_mm_339 0.0086 ms 88.1% + triton_mm_340 0.0090 ms 83.7% + mm 0.0100 ms 75.4% + triton_mm_336 0.0104 ms 72.8% +SingleProcess AUTOTUNE takes 4.1559 seconds +AUTOTUNE mm(2048x176, 176x88) + triton_mm_359 0.0086 ms 100.0% + triton_mm_362 0.0088 ms 97.8% + triton_mm_357 0.0092 ms 92.7% + mm 0.0095 ms 90.5% + triton_mm_355 0.0095 ms 90.5% + triton_mm_363 0.0096 ms 89.6% + triton_mm_356 0.0097 ms 88.4% + triton_mm_358 0.0101 ms 84.8% + triton_mm_360 0.0106 ms 80.7% + triton_mm_354 0.0115 ms 74.4% +SingleProcess AUTOTUNE takes 4.7225 seconds +AUTOTUNE mm(512x88, 88x32) + triton_mm_367 0.0069 ms 100.0% + triton_mm_369 0.0071 ms 96.9% + triton_mm_372 0.0071 ms 96.9% + triton_mm_374 0.0071 ms 96.9% + triton_mm_368 0.0074 ms 93.5% + triton_mm_371 0.0076 ms 91.3% + triton_mm_370 0.0076 ms 90.8% + triton_mm_375 0.0081 ms 85.7% + triton_mm_366 0.0083 ms 83.4% + mm 0.0088 ms 78.8% 
+SingleProcess AUTOTUNE takes 4.2952 seconds +AUTOTUNE convolution(32x32x4x4, 16x32x3x3) + convolution 0.0109 ms 100.0% + triton_convolution_378 0.0168 ms 65.3% + triton_convolution_382 0.0184 ms 59.6% + triton_convolution_381 0.0189 ms 58.0% + triton_convolution_383 0.0280 ms 39.0% + triton_convolution_379 0.0321 ms 34.1% + triton_convolution_380 0.0421 ms 26.0% +SingleProcess AUTOTUNE takes 2.5215 seconds +AUTOTUNE mm(512x104, 104x32) + triton_mm_390 0.0068 ms 100.0% + triton_mm_385 0.0071 ms 95.7% + triton_mm_392 0.0071 ms 95.7% + triton_mm_389 0.0074 ms 92.8% + triton_mm_387 0.0074 ms 92.4% + triton_mm_388 0.0078 ms 87.1% + triton_mm_386 0.0079 ms 86.1% + triton_mm_393 0.0080 ms 84.9% + mm 0.0088 ms 77.9% + triton_mm_384 0.0090 ms 76.0% +SingleProcess AUTOTUNE takes 3.7611 seconds +AUTOTUNE mm(512x120, 120x32) + triton_mm_403 0.0072 ms 100.0% + triton_mm_408 0.0074 ms 97.4% + triton_mm_404 0.0078 ms 91.8% + triton_mm_410 0.0078 ms 91.6% + triton_mm_407 0.0078 ms 91.4% + triton_mm_405 0.0079 ms 91.1% + triton_mm_411 0.0081 ms 88.9% + triton_mm_406 0.0085 ms 84.7% + triton_mm_402 0.0092 ms 78.0% + mm 0.0095 ms 75.2% +SingleProcess AUTOTUNE takes 3.8015 seconds +AUTOTUNE mm(512x136, 136x32) + triton_mm_426 0.0074 ms 100.0% + triton_mm_421 0.0077 ms 96.7% + triton_mm_429 0.0079 ms 94.3% + triton_mm_428 0.0081 ms 91.9% + triton_mm_423 0.0081 ms 91.7% + triton_mm_425 0.0081 ms 91.7% + mm 0.0086 ms 86.6% + triton_mm_422 0.0087 ms 85.1% + triton_mm_424 0.0088 ms 84.7% + triton_mm_420 0.0099 ms 74.8% +SingleProcess AUTOTUNE takes 4.1494 seconds +AUTOTUNE mm(512x152, 152x32) + triton_mm_444 0.0071 ms 100.0% + triton_mm_441 0.0076 ms 93.7% + triton_mm_447 0.0078 ms 91.4% + triton_mm_446 0.0081 ms 88.5% + triton_mm_443 0.0081 ms 88.1% + triton_mm_439 0.0083 ms 85.8% + triton_mm_440 0.0083 ms 85.8% + triton_mm_442 0.0083 ms 85.8% + mm 0.0091 ms 78.2% + triton_mm_438 0.0097 ms 73.8% +SingleProcess AUTOTUNE takes 4.0948 seconds +AUTOTUNE mm(512x168, 168x32) + triton_mm_465 
0.0074 ms 100.0% + triton_mm_464 0.0076 ms 97.5% + triton_mm_462 0.0076 ms 97.1% + triton_mm_461 0.0079 ms 94.3% + triton_mm_459 0.0085 ms 87.2% + triton_mm_457 0.0085 ms 86.9% + triton_mm_458 0.0090 ms 82.6% + mm 0.0091 ms 81.4% + triton_mm_460 0.0092 ms 80.8% + triton_mm_456 0.0104 ms 71.6% +SingleProcess AUTOTUNE takes 4.0597 seconds +AUTOTUNE int_mm(32x184, 184x10, 32x10) + triton_mm_479 0.0069 ms 100.0% + triton_mm_477 0.0071 ms 96.9% + triton_mm_478 0.0076 ms 90.8% + triton_mm_476 0.0078 ms 88.2% + triton_mm_475 0.0083 ms 83.4% + triton_mm_474 0.0092 ms 75.3% +SingleProcess AUTOTUNE takes 2.5471 seconds + running benchmark: 0%| | 0/30 [00:00 /home/cdhernandez/local/pytorch/torch/_ops.py(759)__call__() +-> return self._op(*args, **(kwargs or {})) +(Pdb) TIMEOUT + loading model: 0it [00:00, ?it/s] loading model: 0it [00:02, ?it/s] +timm_efficientnet +cuda eval timm_efficientnet int8dynamic-bs32 +AUTOTUNE convolution(32x3x224x224, 32x3x3x3) + convolution 0.1094 ms 100.0% + triton_convolution_4 0.1242 ms 88.1% + triton_convolution_0 0.1343 ms 81.5% + triton_convolution_3 0.1370 ms 79.9% + triton_convolution_2 0.1484 ms 73.7% + triton_convolution_5 0.1879 ms 58.2% + triton_convolution_1 0.2244 ms 48.8% +SingleProcess AUTOTUNE takes 0.9131 seconds +AUTOTUNE addmm(32x8, 32x32, 32x8) + triton_mm_6 0.0065 ms 100.0% + triton_mm_7 0.0070 ms 91.8% + triton_mm_8 0.0070 ms 91.8% + triton_mm_9 0.0070 ms 91.8% + triton_mm_11 0.0071 ms 91.0% + triton_mm_10 0.0074 ms 87.3% + bias_addmm 0.0076 ms 84.5% + addmm 0.0117 ms 55.3% +SingleProcess AUTOTUNE takes 1.0004 seconds +AUTOTUNE addmm(32x32, 32x8, 8x32) + triton_mm_13 0.0060 ms 100.0% + triton_mm_16 0.0062 ms 96.4% + triton_mm_14 0.0066 ms 90.8% + triton_mm_15 0.0066 ms 90.8% + triton_mm_12 0.0066 ms 90.6% + bias_addmm 0.0074 ms 81.0% + addmm 0.0120 ms 50.1% +SingleProcess AUTOTUNE takes 0.8723 seconds +AUTOTUNE mm(401408x32, 32x16) + triton_mm_17 0.0348 ms 100.0% + triton_mm_21 0.0348 ms 99.8% + triton_mm_24 0.0348 ms 99.7% + 
triton_mm_18 0.0351 ms 99.0% + triton_mm_20 0.0351 ms 98.9% + triton_mm_22 0.0351 ms 98.9% + triton_mm_19 0.0352 ms 98.8% + triton_mm_25 0.0354 ms 98.2% + triton_mm_23 0.0354 ms 98.1% + triton_mm_27 0.0386 ms 90.1% +SingleProcess AUTOTUNE takes 1.4750 seconds +AUTOTUNE mm(401408x16, 16x96) + triton_mm_33 0.0646 ms 100.0% + triton_mm_37 0.0647 ms 99.9% + triton_mm_38 0.0661 ms 97.8% + triton_mm_28 0.0673 ms 96.1% + triton_mm_32 0.0678 ms 95.4% + triton_mm_29 0.0685 ms 94.3% + triton_mm_30 0.0685 ms 94.3% + triton_mm_35 0.0686 ms 94.2% + triton_mm_31 0.0689 ms 93.9% + triton_mm_36 0.0690 ms 93.7% +SingleProcess AUTOTUNE takes 1.6201 seconds +AUTOTUNE addmm(32x4, 32x96, 96x4) + triton_mm_42 0.0067 ms 100.0% + triton_mm_41 0.0073 ms 92.5% + triton_mm_40 0.0073 ms 92.1% + triton_mm_43 0.0074 ms 91.3% + triton_mm_44 0.0078 ms 85.7% + bias_addmm 0.0081 ms 82.7% + triton_mm_39 0.0082 ms 82.2% + triton_mm_46 0.0083 ms 80.8% + triton_mm_45 0.0089 ms 75.8% + addmm 0.0112 ms 59.8% +SingleProcess AUTOTUNE takes 1.2656 seconds +AUTOTUNE addmm(32x96, 32x4, 4x96) + triton_mm_52 0.0062 ms 100.0% + triton_mm_51 0.0063 ms 99.5% + triton_mm_53 0.0063 ms 99.5% + triton_mm_57 0.0065 ms 96.1% + triton_mm_56 0.0067 ms 93.1% + triton_mm_48 0.0067 ms 92.9% + triton_mm_50 0.0069 ms 90.3% + triton_mm_55 0.0069 ms 90.1% + triton_mm_49 0.0070 ms 89.4% + triton_mm_47 0.0070 ms 89.0% +SingleProcess AUTOTUNE takes 1.9945 seconds +AUTOTUNE mm(100352x96, 96x24) + triton_mm_65 0.0292 ms 100.0% + triton_mm_61 0.0296 ms 98.7% + triton_mm_62 0.0299 ms 97.7% + triton_mm_58 0.0300 ms 97.5% + triton_mm_63 0.0304 ms 96.3% + triton_mm_66 0.0304 ms 96.1% + triton_mm_60 0.0309 ms 94.6% + triton_mm_59 0.0310 ms 94.2% + triton_mm_69 0.0318 ms 91.9% + triton_mm_68 0.0320 ms 91.3% +SingleProcess AUTOTUNE takes 1.6133 seconds +AUTOTUNE mm(100352x24, 24x144) + triton_mm_72 0.0318 ms 100.0% + triton_mm_70 0.0332 ms 95.8% + triton_mm_81 0.0342 ms 93.1% + triton_mm_74 0.0343 ms 92.9% + triton_mm_75 0.0355 ms 89.7% + 
triton_mm_78 0.0355 ms 89.6% + triton_mm_79 0.0375 ms 85.0% + triton_mm_71 0.0380 ms 83.8% + triton_mm_77 0.0390 ms 81.6% + triton_mm_76 0.0396 ms 80.3% +SingleProcess AUTOTUNE takes 1.6121 seconds +AUTOTUNE addmm(32x6, 32x144, 144x6) + triton_mm_85 0.0069 ms 100.0% + triton_mm_86 0.0076 ms 90.8% + triton_mm_84 0.0077 ms 90.0% + triton_mm_87 0.0078 ms 88.6% + triton_mm_83 0.0079 ms 87.5% + bias_addmm 0.0091 ms 76.7% + triton_mm_82 0.0091 ms 76.1% + triton_mm_88 0.0108 ms 64.0% + triton_mm_89 0.0111 ms 62.5% + addmm 0.0123 ms 56.7% +SingleProcess AUTOTUNE takes 1.2720 seconds +AUTOTUNE addmm(32x144, 32x6, 6x144) + triton_mm_90 0.0065 ms 100.0% + triton_mm_96 0.0065 ms 100.0% + triton_mm_93 0.0067 ms 96.2% + triton_mm_97 0.0067 ms 96.2% + triton_mm_95 0.0068 ms 95.7% + triton_mm_99 0.0068 ms 95.3% + triton_mm_92 0.0070 ms 92.2% + triton_mm_94 0.0070 ms 92.2% + triton_mm_98 0.0070 ms 92.2% + triton_mm_100 0.0070 ms 92.2% +SingleProcess AUTOTUNE takes 1.7632 seconds +AUTOTUNE mm(100352x144, 144x24) + triton_mm_105 0.0410 ms 100.0% + triton_mm_103 0.0410 ms 99.8% + triton_mm_106 0.0416 ms 98.5% + triton_mm_108 0.0416 ms 98.5% + triton_mm_104 0.0421 ms 97.3% + triton_mm_112 0.0421 ms 97.3% + triton_mm_109 0.0422 ms 97.0% + triton_mm_111 0.0432 ms 94.8% + triton_mm_102 0.0435 ms 94.1% + mm 0.0449 ms 91.3% +SingleProcess AUTOTUNE takes 1.5944 seconds +AUTOTUNE mm(25088x144, 144x40) + triton_mm_146 0.0155 ms 100.0% + triton_mm_151 0.0162 ms 95.5% + triton_mm_148 0.0166 ms 93.1% + triton_mm_155 0.0177 ms 87.5% + triton_mm_145 0.0180 ms 86.1% + triton_mm_152 0.0181 ms 85.3% + triton_mm_144 0.0182 ms 85.0% + triton_mm_147 0.0188 ms 82.4% + triton_mm_149 0.0196 ms 79.1% + mm 0.0201 ms 76.8% +SingleProcess AUTOTUNE takes 4.1262 seconds +AUTOTUNE mm(25088x40, 40x240) + triton_mm_164 0.0207 ms 100.0% + triton_mm_158 0.0208 ms 99.7% + mm 0.0216 ms 95.9% + triton_mm_167 0.0223 ms 93.1% + triton_mm_160 0.0226 ms 91.8% + triton_mm_163 0.0240 ms 86.3% + triton_mm_157 0.0243 ms 85.3% + 
triton_mm_159 0.0249 ms 83.4% + triton_mm_165 0.0258 ms 80.5% + triton_mm_156 0.0261 ms 79.4% +SingleProcess AUTOTUNE takes 1.6583 seconds +AUTOTUNE addmm(32x10, 32x240, 240x10) + triton_mm_171 0.0078 ms 100.0% + triton_mm_172 0.0078 ms 99.2% + triton_mm_173 0.0081 ms 96.4% + triton_mm_170 0.0084 ms 92.4% + triton_mm_169 0.0085 ms 91.0% + bias_addmm 0.0095 ms 81.8% + triton_mm_168 0.0109 ms 71.1% + addmm 0.0128 ms 60.6% + triton_mm_175 0.0148 ms 52.5% + triton_mm_174 0.0150 ms 51.8% +SingleProcess AUTOTUNE takes 1.3745 seconds +AUTOTUNE addmm(32x240, 32x10, 10x240) + triton_mm_181 0.0062 ms 100.0% + triton_mm_185 0.0062 ms 100.0% + triton_mm_178 0.0065 ms 96.5% + triton_mm_180 0.0065 ms 96.5% + triton_mm_182 0.0065 ms 96.5% + triton_mm_184 0.0065 ms 96.5% + triton_mm_186 0.0065 ms 96.5% + triton_mm_176 0.0070 ms 89.0% + triton_mm_183 0.0075 ms 83.7% + triton_mm_179 0.0075 ms 83.3% +SingleProcess AUTOTUNE takes 1.6535 seconds +AUTOTUNE mm(25088x240, 240x40) + triton_mm_189 0.0208 ms 100.0% + triton_mm_191 0.0210 ms 99.2% + triton_mm_194 0.0216 ms 96.2% + triton_mm_195 0.0231 ms 90.2% + triton_mm_190 0.0233 ms 89.2% + mm 0.0240 ms 86.6% + triton_mm_187 0.0247 ms 84.1% + triton_mm_188 0.0248 ms 84.0% + triton_mm_198 0.0259 ms 80.3% + triton_mm_192 0.0260 ms 80.0% +SingleProcess AUTOTUNE takes 4.6825 seconds +AUTOTUNE mm(6272x240, 240x80) + triton_mm_232 0.0122 ms 100.0% + triton_mm_234 0.0124 ms 99.0% + triton_mm_233 0.0125 ms 97.7% + triton_mm_238 0.0126 ms 97.2% + triton_mm_231 0.0128 ms 95.7% + triton_mm_235 0.0140 ms 87.6% + mm 0.0141 ms 86.8% + triton_mm_239 0.0145 ms 84.1% + triton_mm_230 0.0156 ms 78.6% + triton_mm_236 0.0170 ms 71.9% +SingleProcess AUTOTUNE takes 1.6571 seconds +AUTOTUNE mm(6272x80, 80x480) + triton_mm_249 0.0139 ms 100.0% + mm 0.0148 ms 94.0% + triton_mm_245 0.0148 ms 94.0% + triton_mm_243 0.0149 ms 93.1% + triton_mm_242 0.0152 ms 91.4% + triton_mm_246 0.0154 ms 90.6% + triton_mm_244 0.0154 ms 90.4% + triton_mm_250 0.0182 ms 76.4% + 
triton_mm_252 0.0185 ms 75.4% + triton_mm_253 0.0209 ms 66.7% +SingleProcess AUTOTUNE takes 1.7384 seconds +AUTOTUNE addmm(32x20, 32x480, 480x20) + triton_mm_257 0.0092 ms 100.0% + triton_mm_258 0.0100 ms 92.0% + triton_mm_259 0.0101 ms 90.9% + triton_mm_256 0.0103 ms 89.4% + triton_mm_255 0.0113 ms 81.4% + bias_addmm 0.0119 ms 77.2% + addmm 0.0155 ms 59.5% + triton_mm_254 0.0170 ms 54.3% + triton_mm_261 0.0198 ms 46.5% + triton_mm_260 0.0207 ms 44.6% +SingleProcess AUTOTUNE takes 1.2630 seconds +AUTOTUNE addmm(32x480, 32x20, 20x480) + triton_mm_267 0.0065 ms 100.0% + triton_mm_262 0.0065 ms 99.5% + triton_mm_268 0.0065 ms 99.5% + triton_mm_273 0.0069 ms 93.1% + triton_mm_271 0.0070 ms 92.2% + triton_mm_264 0.0071 ms 91.4% + triton_mm_266 0.0071 ms 90.6% + triton_mm_270 0.0071 ms 90.6% + triton_mm_263 0.0074 ms 87.4% + triton_mm_269 0.0075 ms 86.3% +SingleProcess AUTOTUNE takes 1.7823 seconds +AUTOTUNE mm(6272x480, 480x80) + triton_mm_278 0.0151 ms 100.0% + mm 0.0153 ms 98.7% + triton_mm_277 0.0154 ms 98.0% + triton_mm_282 0.0157 ms 95.9% + triton_mm_275 0.0164 ms 91.8% + triton_mm_276 0.0165 ms 91.5% + triton_mm_279 0.0174 ms 86.7% + triton_mm_283 0.0205 ms 73.4% + triton_mm_280 0.0209 ms 72.1% + triton_mm_274 0.0231 ms 65.2% +SingleProcess AUTOTUNE takes 1.6409 seconds +AUTOTUNE mm(6272x480, 480x112) + triton_mm_366 0.0156 ms 100.0% + mm 0.0159 ms 98.4% + triton_mm_365 0.0159 ms 98.4% + triton_mm_364 0.0166 ms 94.0% + triton_mm_370 0.0171 ms 91.4% + triton_mm_363 0.0174 ms 89.9% + triton_mm_367 0.0202 ms 77.4% + triton_mm_362 0.0233 ms 67.3% + triton_mm_371 0.0234 ms 67.0% + triton_mm_368 0.0244 ms 64.3% +SingleProcess AUTOTUNE takes 1.6373 seconds +AUTOTUNE mm(6272x112, 112x672) + triton_mm_376 0.0191 ms 100.0% + triton_mm_381 0.0207 ms 92.6% + triton_mm_375 0.0217 ms 88.2% + triton_mm_378 0.0221 ms 86.4% + triton_mm_382 0.0223 ms 85.7% + triton_mm_377 0.0226 ms 84.6% + triton_mm_374 0.0231 ms 82.8% + mm 0.0250 ms 76.7% + triton_mm_384 0.0277 ms 69.0% + 
triton_mm_383 0.0313 ms 61.1% +SingleProcess AUTOTUNE takes 1.6365 seconds +AUTOTUNE addmm(32x28, 32x672, 672x28) + triton_mm_389 0.0107 ms 100.0% + triton_mm_390 0.0112 ms 96.0% + triton_mm_388 0.0118 ms 90.5% + bias_addmm 0.0121 ms 88.6% + triton_mm_391 0.0128 ms 84.0% + triton_mm_387 0.0139 ms 77.0% + addmm 0.0156 ms 68.8% + triton_mm_386 0.0213 ms 50.4% + triton_mm_393 0.0257 ms 41.7% + triton_mm_392 0.0261 ms 41.1% +SingleProcess AUTOTUNE takes 1.2713 seconds +AUTOTUNE addmm(32x672, 32x28, 28x672) + triton_mm_403 0.0065 ms 100.0% + triton_mm_398 0.0067 ms 96.7% + triton_mm_397 0.0069 ms 93.1% + triton_mm_399 0.0070 ms 92.2% + triton_mm_396 0.0072 ms 90.0% + triton_mm_402 0.0072 ms 89.8% + triton_mm_394 0.0072 ms 89.4% + triton_mm_400 0.0072 ms 89.4% + triton_mm_395 0.0074 ms 87.8% + triton_mm_401 0.0075 ms 86.3% +SingleProcess AUTOTUNE takes 1.7761 seconds +AUTOTUNE mm(6272x672, 672x112) + mm 0.0181 ms 100.0% + triton_mm_410 0.0182 ms 99.6% + triton_mm_409 0.0190 ms 95.1% + triton_mm_408 0.0203 ms 89.1% + triton_mm_414 0.0205 ms 88.3% + triton_mm_407 0.0210 ms 86.3% + triton_mm_411 0.0250 ms 72.5% + triton_mm_412 0.0282 ms 64.3% + triton_mm_415 0.0295 ms 61.4% + triton_mm_406 0.0298 ms 60.7% +SingleProcess AUTOTUNE takes 1.6350 seconds +AUTOTUNE mm(1568x672, 672x192) + mm 0.0125 ms 100.0% + triton_mm_502 0.0135 ms 92.4% + triton_mm_498 0.0149 ms 83.9% + triton_mm_497 0.0154 ms 80.9% + triton_mm_499 0.0157 ms 79.3% + triton_mm_500 0.0157 ms 79.3% + triton_mm_503 0.0160 ms 78.0% + triton_mm_495 0.0177 ms 70.4% + triton_mm_496 0.0179 ms 69.8% + triton_mm_494 0.0256 ms 48.8% +SingleProcess AUTOTUNE takes 5.0328 seconds +AUTOTUNE mm(1568x192, 192x1152) + triton_mm_507 0.0138 ms 100.0% + triton_mm_508 0.0142 ms 97.1% + triton_mm_506 0.0144 ms 95.3% + mm 0.0149 ms 92.1% + triton_mm_514 0.0154 ms 89.2% + triton_mm_509 0.0156 ms 87.9% + triton_mm_510 0.0158 ms 87.2% + triton_mm_513 0.0166 ms 82.9% + triton_mm_516 0.0175 ms 78.6% + triton_mm_512 0.0215 ms 63.9% 
+SingleProcess AUTOTUNE takes 1.6465 seconds +AUTOTUNE addmm(32x48, 32x1152, 1152x48) + bias_addmm 0.0122 ms 100.0% + triton_mm_521 0.0134 ms 90.7% + triton_mm_522 0.0151 ms 80.3% + triton_mm_524 0.0153 ms 79.2% + addmm 0.0155 ms 78.7% + triton_mm_525 0.0158 ms 77.2% + triton_mm_520 0.0161 ms 75.4% + triton_mm_519 0.0209 ms 58.2% + triton_mm_518 0.0346 ms 35.1% + triton_mm_523 0.0355 ms 34.2% +SingleProcess AUTOTUNE takes 1.5143 seconds +AUTOTUNE addmm(32x1152, 32x48, 48x1152) + triton_mm_536 0.0072 ms 100.0% + triton_mm_533 0.0074 ms 97.0% + triton_mm_530 0.0076 ms 94.1% + triton_mm_537 0.0077 ms 93.0% + triton_mm_529 0.0079 ms 91.5% + triton_mm_534 0.0079 ms 91.5% + triton_mm_531 0.0080 ms 89.6% + triton_mm_532 0.0082 ms 87.9% + triton_mm_528 0.0082 ms 87.5% + triton_mm_538 0.0083 ms 86.5% +SingleProcess AUTOTUNE takes 1.7876 seconds +AUTOTUNE mm(1568x1152, 1152x192) + mm 0.0146 ms 100.0% + triton_mm_548 0.0175 ms 83.2% + triton_mm_549 0.0197 ms 73.9% + triton_mm_543 0.0203 ms 72.0% + triton_mm_544 0.0207 ms 70.4% + triton_mm_545 0.0210 ms 69.4% + triton_mm_546 0.0218 ms 67.1% + triton_mm_541 0.0252 ms 57.8% + triton_mm_542 0.0261 ms 55.9% + triton_mm_540 0.0390 ms 37.4% +SingleProcess AUTOTUNE takes 1.8321 seconds +AUTOTUNE mm(1568x1152, 1152x320) + mm 0.0172 ms 100.0% + triton_mm_681 0.0209 ms 82.4% + triton_mm_682 0.0209 ms 82.4% + triton_mm_686 0.0223 ms 77.3% + triton_mm_680 0.0257 ms 66.9% + triton_mm_679 0.0260 ms 66.2% + triton_mm_683 0.0284 ms 60.6% + triton_mm_684 0.0289 ms 59.5% + triton_mm_687 0.0314 ms 54.8% + triton_mm_678 0.0392 ms 44.0% +SingleProcess AUTOTUNE takes 1.6326 seconds +AUTOTUNE mm(1568x320, 320x1280) + triton_mm_691 0.0172 ms 100.0% + triton_mm_692 0.0173 ms 99.4% + mm 0.0180 ms 95.6% + triton_mm_690 0.0184 ms 93.4% + triton_mm_693 0.0193 ms 89.4% + triton_mm_694 0.0195 ms 88.5% + triton_mm_698 0.0204 ms 84.3% + triton_mm_697 0.0211 ms 81.8% + triton_mm_700 0.0258 ms 66.8% + triton_mm_696 0.0310 ms 55.5% +SingleProcess AUTOTUNE takes 
1.6535 seconds +AUTOTUNE int_mm(32x1280, 1280x1000, 32x1000) + triton_mm_712 0.0139 ms 100.0% + triton_mm_707 0.0156 ms 89.3% + triton_mm_710 0.0162 ms 85.8% + triton_mm_708 0.0169 ms 82.2% + triton_mm_711 0.0172 ms 80.8% + triton_mm_706 0.0182 ms 76.1% + triton_mm_705 0.0208 ms 66.7% + triton_mm_704 0.0236 ms 58.9% + triton_mm_703 0.0267 ms 52.0% + triton_mm_702 0.0346 ms 40.1% +SingleProcess AUTOTUNE takes 1.5528 seconds + running benchmark: 0%| | 0/30 [00:00 + run() + File "/home/cdhernandez/local/benchmark/run_benchmark.py", line 30, in run + benchmark.run(bm_args) + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/run.py", line 24, in run + main(TorchBenchmarkRunner(), original_dir, args) + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 3133, in main + process_entry(0, runner, original_dir, args) + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 3090, in process_entry + return maybe_fresh_cache( + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 1714, in inner + return fn(*args, **kwargs) + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 3559, in run + ) = runner.load_model( + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/torchbench.py", line 383, in load_model + module = importlib.import_module(c) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/importlib/__init__.py", line 126, in import_module + return _bootstrap._gcd_import(name[level:], package, level) + File "", line 1050, in _gcd_import + File "", line 1027, in _find_and_load + File "", line 1006, in _find_and_load_unlocked + File "", line 688, in _load_unlocked + File "", line 883, in exec_module + File "", line 241, in _call_with_frames_removed + File "/home/cdhernandez/local/benchmark/torchbenchmark/canary_models/torchrec_dlrm/__init__.py", line 7, in + from 
.data.dlrm_dataloader import get_dataloader + File "/home/cdhernandez/local/benchmark/torchbenchmark/canary_models/torchrec_dlrm/data/dlrm_dataloader.py", line 13, in + from torchrec.datasets.criteo import ( + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/torchrec/__init__.py", line 8, in + import torchrec.distributed # noqa + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/torchrec/distributed/__init__.py", line 36, in + from torchrec.distributed.model_parallel import DistributedModelParallel # noqa + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/torchrec/distributed/model_parallel.py", line 21, in + from torchrec.distributed.planner import EmbeddingShardingPlanner, Topology + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/torchrec/distributed/planner/__init__.py", line 22, in + from torchrec.distributed.planner.planners import EmbeddingShardingPlanner # noqa + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/torchrec/distributed/planner/planners.py", line 19, in + from torchrec.distributed.planner.constants import BATCH_SIZE, MAX_SIZE + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/torchrec/distributed/planner/constants.py", line 10, in + from torchrec.distributed.embedding_types import EmbeddingComputeKernel + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/torchrec/distributed/embedding_types.py", line 14, in + from fbgemm_gpu.split_table_batched_embeddings_ops_training import EmbeddingLocation + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/fbgemm_gpu/__init__.py", line 23, in + from . 
import _fbgemm_gpu_docs, sparse_ops # noqa: F401, E402 # noqa: F401, E402 + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/fbgemm_gpu/_fbgemm_gpu_docs.py", line 19, in + torch.ops.fbgemm.jagged_2d_to_dense, + File "/home/cdhernandez/local/pytorch/torch/_ops.py", line 822, in __getattr__ + raise AttributeError( +AttributeError: '_OpNamespace' 'fbgemm' object has no attribute 'jagged_2d_to_dense' +Run failed with return code: 1 +Output: None +Error: None + loading model: 0it [00:00, ?it/s] loading model: 0it [00:04, ?it/s] +BERT_pytorch +cuda eval BERT_pytorch baseline-bs32 +AUTOTUNE mm(4096x768, 768x768) + mm 0.0306 ms 100.0% + triton_mm_2 0.0371 ms 82.5% + triton_mm_1 0.0403 ms 75.8% + triton_mm_3 0.0411 ms 74.4% + triton_mm_4 0.0413 ms 73.9% + triton_mm_7 0.0438 ms 69.8% + triton_mm_8 0.0481 ms 63.6% + triton_mm_0 0.0570 ms 53.7% + triton_mm_10 0.0712 ms 42.9% + triton_mm_9 0.0790 ms 38.7% +SingleProcess AUTOTUNE takes 1.6112 seconds +AUTOTUNE bmm(384x128x64, 384x64x128) + triton_bmm_32 0.0257 ms 100.0% + triton_bmm_26 0.0265 ms 97.2% + triton_bmm_25 0.0267 ms 96.5% + triton_bmm_27 0.0272 ms 94.8% + triton_bmm_28 0.0281 ms 91.7% + triton_bmm_34 0.0281 ms 91.6% + triton_bmm_24 0.0282 ms 91.2% + bmm 0.0284 ms 90.8% + triton_bmm_31 0.0296 ms 87.0% + triton_bmm_33 0.0316 ms 81.6% +SingleProcess AUTOTUNE takes 1.7496 seconds +AUTOTUNE bmm(384x128x128, 384x128x64) + triton_bmm_50 0.0285 ms 100.0% + triton_bmm_49 0.0293 ms 97.4% + triton_bmm_52 0.0297 ms 96.1% + triton_bmm_48 0.0300 ms 95.1% + triton_bmm_55 0.0300 ms 94.9% + triton_bmm_56 0.0300 ms 94.9% + triton_bmm_51 0.0312 ms 91.6% + triton_bmm_54 0.0319 ms 89.5% + triton_bmm_58 0.0326 ms 87.5% + triton_bmm_53 0.0332 ms 85.9% +SingleProcess AUTOTUNE takes 1.6227 seconds +AUTOTUNE mm(4096x768, 768x3072) + mm 0.0995 ms 100.0% + triton_mm_74 0.1117 ms 89.0% + triton_mm_73 0.1118 ms 89.0% + triton_mm_79 0.1285 ms 77.4% + triton_mm_75 0.1327 ms 75.0% + triton_mm_76 0.1349 ms 73.8% + 
triton_mm_72 0.1421 ms 70.0% + triton_mm_80 0.1557 ms 63.9% + triton_mm_82 0.2340 ms 42.5% + triton_mm_77 0.2836 ms 35.1% +SingleProcess AUTOTUNE takes 1.6582 seconds +AUTOTUNE mm(4096x3072, 3072x768) + mm 0.0851 ms 100.0% + triton_mm_86 0.1235 ms 68.9% + triton_mm_85 0.1276 ms 66.7% + triton_mm_87 0.1304 ms 65.3% + triton_mm_88 0.1315 ms 64.7% + triton_mm_92 0.1557 ms 54.7% + triton_mm_84 0.1623 ms 52.4% + triton_mm_91 0.1916 ms 44.4% + triton_mm_89 0.2732 ms 31.2% + triton_mm_90 0.2747 ms 31.0% +SingleProcess AUTOTUNE takes 1.6729 seconds + running benchmark: 0%| | 0/30 [00:00 + async_compile.wait(globals()) + File "/home/cdhernandez/local/pytorch/torch/_inductor/codecache.py", line 2470, in wait + scope[key] = result.result() + File "/home/cdhernandez/local/pytorch/torch/_inductor/codecache.py", line 2313, in result + self.future.result() + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/concurrent/futures/_base.py", line 458, in result + return self.__get_result() + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/concurrent/futures/_base.py", line 403, in __get_result + raise self._exception +torch._dynamo.exc.BackendCompilerFailed: backend='inductor' raised: +CompilationError: at 14:40: xnumel = 196 + yoffset = tl.program_id(1).to(tl.int64) * YBLOCK + yindex = yoffset + tl.arange(0, YBLOCK)[None, :].to(tl.int64) + ymask = yindex < ynumel + xoffset = tl.program_id(0).to(tl.int64) * XBLOCK + xindex = xoffset + tl.arange(0, XBLOCK)[:, None].to(tl.int64) + xmask = xindex < xnumel + x2 = xindex + y3 = yindex + y0 = yindex % 1024 + y1 = (yindex // 1024) + tmp0 = tl.load(in_ptr0 + (x2 + (196*y3)), xmask, eviction_policy='evict_last').to(tl.float32) + ^ +ValueError('numel (262144) exceeds triton maximum tensor numel (131072)') + +Set TORCH_LOGS="+dynamo" and TORCHDYNAMO_VERBOSE=1 for more information + + +You can suppress this exception and fall back to eager by setting: + import torch._dynamo + 
torch._dynamo.config.suppress_errors = True + +Run failed with return code: 255 +Output: None +Error: None + loading model: 0it [00:00, ?it/s]detectron2_fasterrcnn_r_101_dc5 + loading model: 0it [00:09, ?it/s] +WARNING:root:detectron2_fasterrcnn_r_101_dc5 failed to load +Original Error: "roi_align_forward_kernel" not implemented for 'BFloat16' +Eager model failed to run +Traceback (most recent call last): + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 1931, in validate_model + self.model_iter_fn(model, example_inputs) + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/torchbench.py", line 540, in forward_pass + return mod(*inputs) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1520, in _call_impl + return forward_call(*args, **kwargs) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/modeling/meta_arch/rcnn.py", line 150, in forward + return self.inference(batched_inputs) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/modeling/meta_arch/rcnn.py", line 213, in inference + results, _ = self.roi_heads(images, features, proposals, None) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1520, in _call_impl + return forward_call(*args, **kwargs) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/modeling/roi_heads/roi_heads.py", line 747, in forward + pred_instances = self._forward_box(features, proposals) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/modeling/roi_heads/roi_heads.py", 
line 798, in _forward_box + box_features = self.box_pooler(features, [x.proposal_boxes for x in proposals]) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1520, in _call_impl + return forward_call(*args, **kwargs) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/modeling/poolers.py", line 246, in forward + return self.level_poolers[0](x[0], pooler_fmt_boxes) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1520, in _call_impl + return forward_call(*args, **kwargs) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/layers/roi_align.py", line 58, in forward + return roi_align( + File "/home/cdhernandez/local/vision/torchvision/ops/roi_align.py", line 238, in roi_align + return torch.ops.torchvision.roi_align( + File "/home/cdhernandez/local/pytorch/torch/_ops.py", line 755, in __call__ + return self._op(*args, **(kwargs or {})) +RuntimeError: "roi_align_forward_kernel" not implemented for 'BFloat16' + +The above exception was the direct cause of the following exception: + +Traceback (most recent call last): + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 3559, in run + ) = runner.load_model( + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/torchbench.py", line 476, in load_model + self.validate_model(model, example_inputs) + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 1934, in validate_model + raise NotImplementedError("Eager model failed to run") from e +NotImplementedError: Eager model failed to run + + loading model: 0it 
[00:00, ?it/s]detectron2_fasterrcnn_r_101_fpn +WARNING:common:Model detectron2_fasterrcnn_r_101_fpn does not support bfloat16, running with amp instead + loading model: 0it [00:05, ?it/s] +WARNING:common:Model detectron2_fasterrcnn_r_101_fpn does not support bfloat16, running with amp instead +cuda eval detectron2_fasterrcnn_r_101_fpn baseline-bs32 +WARNING:common:Model detectron2_fasterrcnn_r_101_fpn does not support bfloat16, running with amp instead +skipping cudagraphs due to ['mutated inputs'] +AUTOTUNE convolution(32x3x1216x1344, 64x3x7x7) + convolution 3.7276 ms 100.0% + triton_convolution_3 22.1441 ms 16.8% + triton_convolution_4 24.2946 ms 15.3% + triton_convolution_5 26.9298 ms 13.8% + triton_convolution_0 30.0927 ms 12.4% + triton_convolution_2 32.5115 ms 11.5% + triton_convolution_1 81.4334 ms 4.6% +SingleProcess AUTOTUNE takes 4.6148 seconds +AUTOTUNE mm(3268608x64, 64x64) + triton_mm_14 0.5204 ms 100.0% + triton_mm_8 0.5205 ms 100.0% + triton_mm_7 0.5292 ms 98.3% + triton_mm_10 0.5374 ms 96.8% + triton_mm_6 0.5463 ms 95.2% + triton_mm_13 0.5497 ms 94.7% + triton_mm_9 0.5542 ms 93.9% + mm 0.5696 ms 91.4% + triton_mm_15 0.7099 ms 73.3% + triton_mm_16 0.7413 ms 70.2% +SingleProcess AUTOTUNE takes 3.9672 seconds +AUTOTUNE convolution(32x64x304x336, 64x64x3x3) + convolution 1.4512 ms 100.0% + triton_convolution_18 7.1300 ms 20.4% + triton_convolution_23 8.1302 ms 17.8% + triton_convolution_24 9.7394 ms 14.9% + triton_convolution_19 12.0317 ms 12.1% + triton_convolution_21 12.1762 ms 11.9% + triton_convolution_22 12.6389 ms 11.5% + triton_convolution_20 28.5142 ms 5.1% +SingleProcess AUTOTUNE takes 4.5511 seconds +AUTOTUNE mm(3268608x64, 64x256) + triton_mm_27 1.5106 ms 100.0% + triton_mm_26 1.5171 ms 99.6% + triton_mm_28 1.7556 ms 86.0% + triton_mm_29 1.7616 ms 85.8% + mm 1.7827 ms 84.7% + triton_mm_33 1.8637 ms 81.1% + triton_mm_25 1.8785 ms 80.4% + triton_mm_32 1.9761 ms 76.4% + triton_mm_35 2.2260 ms 67.9% + triton_mm_34 2.8418 ms 53.2% +SingleProcess 
AUTOTUNE takes 4.4901 seconds +AUTOTUNE mm(3268608x256, 256x64) + triton_mm_51 1.3317 ms 100.0% + triton_mm_50 1.3587 ms 98.0% + triton_mm_53 1.3629 ms 97.7% + triton_mm_56 1.3673 ms 97.4% + triton_mm_57 1.3967 ms 95.3% + triton_mm_49 1.4020 ms 95.0% + mm 1.4044 ms 94.8% + triton_mm_52 1.4348 ms 92.8% + triton_mm_59 2.0760 ms 64.1% + triton_mm_55 2.1260 ms 62.6% +SingleProcess AUTOTUNE takes 4.6271 seconds +AUTOTUNE convolution(32x256x304x336, 128x256x1x1) + convolution 0.4530 ms 100.0% + triton_convolution_114 1.0770 ms 42.1% + triton_convolution_111 1.2264 ms 36.9% + triton_convolution_117 1.3645 ms 33.2% + triton_convolution_116 1.5053 ms 30.1% + triton_convolution_115 1.6691 ms 27.1% + triton_convolution_112 3.1430 ms 14.4% + triton_convolution_113 6.8266 ms 6.6% +SingleProcess AUTOTUNE takes 4.1577 seconds +AUTOTUNE convolution(32x128x152x168, 128x128x3x3) + convolution 1.2069 ms 100.0% + triton_convolution_121 7.0850 ms 17.0% + triton_convolution_118 8.0108 ms 15.1% + triton_convolution_123 8.3698 ms 14.4% + triton_convolution_124 10.6358 ms 11.3% + triton_convolution_122 12.3811 ms 9.7% + triton_convolution_119 13.8907 ms 8.7% + triton_convolution_120 28.6004 ms 4.2% +SingleProcess AUTOTUNE takes 4.5632 seconds +AUTOTUNE mm(817152x128, 128x512) + triton_mm_127 1.0106 ms 100.0% + triton_mm_126 1.0184 ms 99.2% + triton_mm_132 1.0308 ms 98.0% + mm 1.1343 ms 89.1% + triton_mm_125 1.1611 ms 87.0% + triton_mm_128 1.1932 ms 84.7% + triton_mm_129 1.2067 ms 83.7% + triton_mm_133 1.3923 ms 72.6% + triton_mm_135 1.7286 ms 58.5% + triton_mm_134 2.2943 ms 44.0% +SingleProcess AUTOTUNE takes 4.4943 seconds +AUTOTUNE convolution(32x256x304x336, 512x256x1x1) + convolution 1.5192 ms 100.0% + triton_convolution_140 4.2484 ms 35.8% + triton_convolution_142 4.5286 ms 33.5% + triton_convolution_143 5.4167 ms 28.0% + triton_convolution_141 6.6385 ms 22.9% + triton_convolution_137 8.3060 ms 18.3% + triton_convolution_138 12.5158 ms 12.1% + triton_convolution_139 27.1067 ms 5.6% 
+SingleProcess AUTOTUNE takes 5.2291 seconds +AUTOTUNE mm(817152x512, 512x128) + mm 0.7270 ms 100.0% + triton_mm_146 0.8278 ms 87.8% + triton_mm_145 0.8757 ms 83.0% + triton_mm_148 0.9058 ms 80.3% + triton_mm_147 0.9367 ms 77.6% + triton_mm_151 0.9769 ms 74.4% + triton_mm_152 1.0830 ms 67.1% + triton_mm_144 1.1040 ms 65.9% + triton_mm_154 1.6869 ms 43.1% + triton_mm_149 1.7984 ms 40.4% +SingleProcess AUTOTUNE takes 5.0176 seconds +AUTOTUNE convolution(32x512x152x168, 256x512x1x1) + convolution 0.3219 ms 100.0% + triton_convolution_240 1.0508 ms 30.6% + triton_convolution_242 1.0755 ms 29.9% + triton_convolution_243 1.2962 ms 24.8% + triton_convolution_241 1.6665 ms 19.3% + triton_convolution_237 2.1070 ms 15.3% + triton_convolution_238 3.1459 ms 10.2% + triton_convolution_239 6.7243 ms 4.8% +SingleProcess AUTOTUNE takes 4.9713 seconds +AUTOTUNE convolution(32x256x76x84, 256x256x3x3) + convolution 1.0804 ms 100.0% + triton_convolution_249 6.4292 ms 16.8% + triton_convolution_247 7.2717 ms 14.9% + triton_convolution_244 7.9900 ms 13.5% + triton_convolution_250 11.5452 ms 9.4% + triton_convolution_248 16.4734 ms 6.6% + triton_convolution_245 19.3684 ms 5.6% + triton_convolution_246 29.2683 ms 3.7% +SingleProcess AUTOTUNE takes 5.1353 seconds +AUTOTUNE mm(204288x256, 256x1024) + mm 0.6786 ms 100.0% + triton_mm_253 0.7821 ms 86.8% + triton_mm_252 0.7883 ms 86.1% + triton_mm_258 0.8225 ms 82.5% + triton_mm_251 0.9310 ms 72.9% + triton_mm_255 0.9324 ms 72.8% + triton_mm_254 0.9355 ms 72.5% + triton_mm_259 1.0938 ms 62.0% + triton_mm_261 1.3321 ms 50.9% + triton_mm_260 1.9005 ms 35.7% +SingleProcess AUTOTUNE takes 4.6551 seconds +AUTOTUNE convolution(32x512x152x168, 1024x512x1x1) + convolution 1.2104 ms 100.0% + triton_convolution_266 4.1278 ms 29.3% + triton_convolution_268 4.2156 ms 28.7% + triton_convolution_269 5.0886 ms 23.8% + triton_convolution_267 6.5420 ms 18.5% + triton_convolution_263 8.1750 ms 14.8% + triton_convolution_264 12.4580 ms 9.7% + 
triton_convolution_265 26.7420 ms 4.5% +SingleProcess AUTOTUNE takes 5.2265 seconds +AUTOTUNE mm(204288x1024, 1024x256) + mm 0.5437 ms 100.0% + triton_mm_271 0.6632 ms 82.0% + triton_mm_272 0.6646 ms 81.8% + triton_mm_274 0.7669 ms 70.9% + triton_mm_273 0.7675 ms 70.8% + triton_mm_270 0.8797 ms 61.8% + triton_mm_278 0.8978 ms 60.6% + triton_mm_277 0.9712 ms 56.0% + triton_mm_280 1.3469 ms 40.4% + triton_mm_279 1.5811 ms 34.4% +SingleProcess AUTOTUNE takes 5.2192 seconds +AUTOTUNE convolution(32x1024x76x84, 512x1024x1x1) + convolution 0.2622 ms 100.0% + triton_convolution_955 1.0030 ms 26.1% + triton_convolution_957 1.1227 ms 23.4% + triton_convolution_958 1.2446 ms 21.1% + triton_convolution_956 1.9245 ms 13.6% + triton_convolution_952 2.2700 ms 11.6% + triton_convolution_953 3.1720 ms 8.3% + triton_convolution_954 6.5570 ms 4.0% +SingleProcess AUTOTUNE takes 4.5819 seconds +AUTOTUNE convolution(32x512x38x42, 512x512x3x3) + convolution 1.0515 ms 100.0% + triton_convolution_964 6.9621 ms 15.1% + triton_convolution_959 9.8507 ms 10.7% + triton_convolution_962 10.1274 ms 10.4% + triton_convolution_965 17.0127 ms 6.2% + triton_convolution_963 19.1787 ms 5.5% + triton_convolution_960 21.0096 ms 5.0% + triton_convolution_961 28.6011 ms 3.7% +SingleProcess AUTOTUNE takes 5.3532 seconds +AUTOTUNE mm(51072x512, 512x2048) + mm 0.5559 ms 100.0% + triton_mm_968 0.6594 ms 84.3% + triton_mm_967 0.6638 ms 83.7% + triton_mm_973 0.7133 ms 77.9% + triton_mm_970 0.7841 ms 70.9% + triton_mm_969 0.7904 ms 70.3% + triton_mm_966 0.8376 ms 66.4% + triton_mm_974 0.9407 ms 59.1% + triton_mm_976 1.2370 ms 44.9% + triton_mm_971 1.7099 ms 32.5% +SingleProcess AUTOTUNE takes 4.7618 seconds +AUTOTUNE convolution(32x1024x76x84, 2048x1024x1x1) + convolution 1.0470 ms 100.0% + triton_convolution_981 3.9419 ms 26.6% + triton_convolution_983 4.3415 ms 24.1% + triton_convolution_984 4.9420 ms 21.2% + triton_convolution_982 7.6665 ms 13.7% + triton_convolution_978 8.2708 ms 12.7% + 
triton_convolution_979 12.2919 ms 8.5% + triton_convolution_980 26.1428 ms 4.0% +SingleProcess AUTOTUNE takes 4.8388 seconds +AUTOTUNE mm(51072x2048, 2048x512) + mm 0.4948 ms 100.0% + triton_mm_987 0.6086 ms 81.3% + triton_mm_986 0.6168 ms 80.2% + triton_mm_988 0.7104 ms 69.6% + triton_mm_989 0.7249 ms 68.3% + triton_mm_985 0.8283 ms 59.7% + triton_mm_993 0.8379 ms 59.1% + triton_mm_992 0.8847 ms 55.9% + triton_mm_995 1.3007 ms 38.0% + triton_mm_990 1.5264 ms 32.4% +SingleProcess AUTOTUNE takes 5.0192 seconds +AUTOTUNE addmm(51072x256, 51072x2048, 2048x256) + bias_addmm 0.3045 ms 100.0% + addmm 0.3104 ms 98.1% + triton_mm_1049 0.3329 ms 91.5% + triton_mm_1048 0.3370 ms 90.4% + triton_mm_1050 0.3632 ms 83.8% + triton_mm_1051 0.3691 ms 82.5% + triton_mm_1055 0.4297 ms 70.9% + triton_mm_1047 0.4396 ms 69.3% + triton_mm_1054 0.5082 ms 59.9% + triton_mm_1057 0.7069 ms 43.1% +SingleProcess AUTOTUNE takes 5.5123 seconds +AUTOTUNE convolution(32x256x38x42, 256x256x3x3) + convolution 0.2800 ms 100.0% + triton_convolution_1064 1.7070 ms 16.4% + triton_convolution_1062 1.8459 ms 15.2% + triton_convolution_1059 2.1395 ms 13.1% + triton_convolution_1065 2.9374 ms 9.5% + triton_convolution_1063 3.9718 ms 7.0% + triton_convolution_1060 4.7213 ms 5.9% + triton_convolution_1061 7.4863 ms 3.7% +SingleProcess AUTOTUNE takes 4.7817 seconds +AUTOTUNE addmm(3268608x256, 3268608x256, 256x256) + triton_mm_1068 3.6515 ms 100.0% + triton_mm_1067 3.7399 ms 97.6% + triton_mm_1073 3.9075 ms 93.4% + triton_mm_1069 4.2690 ms 85.5% + triton_mm_1070 4.3191 ms 84.5% + triton_mm_1066 4.4729 ms 81.6% + triton_mm_1074 4.8846 ms 74.8% + bias_addmm 5.9405 ms 61.5% + addmm 5.9812 ms 61.0% + triton_mm_1076 6.6301 ms 55.1% +SingleProcess AUTOTUNE takes 5.7035 seconds +AUTOTUNE addmm(817152x256, 817152x512, 512x256) + bias_addmm 1.3548 ms 100.0% + triton_mm_1080 1.4332 ms 94.5% + triton_mm_1079 1.5655 ms 86.5% + triton_mm_1081 1.8005 ms 75.2% + triton_mm_1082 1.8064 ms 75.0% + addmm 1.9001 ms 71.3% + 
triton_mm_1078 1.9506 ms 69.5% + triton_mm_1086 2.0710 ms 65.4% + triton_mm_1085 2.6391 ms 51.3% + triton_mm_1088 2.8551 ms 47.5% +SingleProcess AUTOTUNE takes 5.6193 seconds +AUTOTUNE addmm(204288x256, 204288x1024, 1024x256) + bias_addmm 0.5761 ms 100.0% + triton_mm_1091 0.6856 ms 84.0% + triton_mm_1092 0.6867 ms 83.9% + addmm 0.7476 ms 77.1% + triton_mm_1093 0.7855 ms 73.3% + triton_mm_1094 0.7955 ms 72.4% + triton_mm_1090 0.9032 ms 63.8% + triton_mm_1098 0.9230 ms 62.4% + triton_mm_1097 0.9917 ms 58.1% + triton_mm_1100 1.3640 ms 42.2% +SingleProcess AUTOTUNE takes 5.7268 seconds +AUTOTUNE convolution(32x256x304x336, 256x256x3x3) + convolution 17.6539 ms 100.0% + triton_convolution_1107 100.5318 ms 17.6% + triton_convolution_1105 118.5658 ms 14.9% + triton_convolution_1102 122.7876 ms 14.4% + triton_convolution_1108 186.2668 ms 9.5% + triton_convolution_1106 265.2579 ms 6.7% + triton_convolution_1103 300.6816 ms 5.9% + triton_convolution_1104 465.4240 ms 3.8% +SingleProcess AUTOTUNE takes 16.9654 seconds +AUTOTUNE convolution(32x256x152x168, 256x256x3x3) + convolution 4.4577 ms 100.0% + triton_convolution_1114 25.2454 ms 17.7% + triton_convolution_1112 29.8068 ms 15.0% + triton_convolution_1109 31.0213 ms 14.4% + triton_convolution_1115 46.3662 ms 9.6% + triton_convolution_1113 66.0892 ms 6.7% + triton_convolution_1110 76.3744 ms 5.8% + triton_convolution_1111 116.2610 ms 3.8% +SingleProcess AUTOTUNE takes 7.4362 seconds +AUTOTUNE addmm(3268608x3, 3268608x256, 256x3) + triton_mm_1132 1.0499 ms 100.0% + triton_mm_1131 1.0695 ms 98.2% + triton_mm_1135 1.0697 ms 98.1% + triton_mm_1133 1.0720 ms 97.9% + triton_mm_1134 1.0749 ms 97.7% + triton_mm_1137 1.0843 ms 96.8% + triton_mm_1130 1.0863 ms 96.6% + triton_mm_1141 1.1345 ms 92.5% + triton_mm_1140 1.1662 ms 90.0% + triton_mm_1138 1.2461 ms 84.3% +SingleProcess AUTOTUNE takes 5.0383 seconds +AUTOTUNE addmm(817152x3, 817152x256, 256x3) + triton_mm_1151 0.2790 ms 100.0% + triton_mm_1150 0.2831 ms 98.6% + triton_mm_1154 
0.2835 ms 98.4% + triton_mm_1152 0.2840 ms 98.3% + triton_mm_1153 0.2846 ms 98.1% + triton_mm_1149 0.2888 ms 96.6% + triton_mm_1156 0.2890 ms 96.5% + triton_mm_1160 0.3040 ms 91.8% + triton_mm_1159 0.3087 ms 90.4% + triton_mm_1157 0.3271 ms 85.3% +SingleProcess AUTOTUNE takes 4.3400 seconds +AUTOTUNE addmm(204288x3, 204288x256, 256x3) + triton_mm_1169 0.0862 ms 100.0% + triton_mm_1171 0.0866 ms 99.5% + triton_mm_1173 0.0869 ms 99.2% + triton_mm_1172 0.0875 ms 98.5% + triton_mm_1170 0.0877 ms 98.2% + triton_mm_1168 0.0899 ms 95.9% + triton_mm_1175 0.0902 ms 95.5% + triton_mm_1178 0.0952 ms 90.6% + triton_mm_1176 0.0966 ms 89.2% + triton_mm_1179 0.1013 ms 85.0% +SingleProcess AUTOTUNE takes 3.9785 seconds +AUTOTUNE addmm(51072x3, 51072x256, 256x3) + triton_mm_1189 0.0339 ms 100.0% + triton_mm_1191 0.0343 ms 98.8% + triton_mm_1188 0.0344 ms 98.7% + triton_mm_1190 0.0345 ms 98.3% + triton_mm_1187 0.0353 ms 96.2% + triton_mm_1194 0.0353 ms 96.2% + triton_mm_1192 0.0355 ms 95.7% + triton_mm_1198 0.0364 ms 93.3% + triton_mm_1197 0.0368 ms 92.3% + triton_mm_1195 0.0371 ms 91.4% +SingleProcess AUTOTUNE takes 4.3372 seconds +AUTOTUNE convolution(32x256x19x21, 256x256x3x3) + convolution 0.0686 ms 100.0% + triton_convolution_1202 0.4279 ms 16.0% + triton_convolution_1204 0.4406 ms 15.6% + triton_convolution_1199 0.5708 ms 12.0% + triton_convolution_1205 0.6937 ms 9.9% + triton_convolution_1200 0.7684 ms 8.9% + triton_convolution_1203 0.7883 ms 8.7% + triton_convolution_1201 1.8229 ms 3.8% +SingleProcess AUTOTUNE takes 4.5925 seconds +AUTOTUNE addmm(12768x3, 12768x256, 256x3) + triton_mm_1209 0.0132 ms 100.0% + triton_mm_1207 0.0133 ms 99.5% + triton_mm_1211 0.0133 ms 99.3% + triton_mm_1208 0.0140 ms 94.7% + triton_mm_1210 0.0140 ms 94.7% + triton_mm_1214 0.0143 ms 92.2% + bias_addmm 0.0155 ms 85.2% + triton_mm_1206 0.0157 ms 84.3% + triton_mm_1213 0.0161 ms 82.3% + triton_mm_1216 0.0176 ms 74.9% +SingleProcess AUTOTUNE takes 4.0712 seconds +AUTOTUNE addmm(3268608x12, 
3268608x256, 256x12) + triton_mm_1226 1.0775 ms 100.0% + triton_mm_1220 1.1099 ms 97.1% + triton_mm_1219 1.1344 ms 95.0% + triton_mm_1223 1.1381 ms 94.7% + triton_mm_1222 1.1384 ms 94.7% + triton_mm_1221 1.1386 ms 94.6% + triton_mm_1225 1.1644 ms 92.5% + triton_mm_1227 1.1674 ms 92.3% + triton_mm_1224 1.1682 ms 92.2% + triton_mm_1218 1.1714 ms 92.0% +SingleProcess AUTOTUNE takes 4.1467 seconds +AUTOTUNE addmm(817152x12, 817152x256, 256x12) + triton_mm_1238 0.2854 ms 100.0% + triton_mm_1232 0.2924 ms 97.6% + triton_mm_1231 0.2959 ms 96.5% + triton_mm_1233 0.2972 ms 96.0% + triton_mm_1235 0.2973 ms 96.0% + triton_mm_1234 0.2988 ms 95.5% + triton_mm_1236 0.3048 ms 93.6% + triton_mm_1237 0.3065 ms 93.1% + triton_mm_1239 0.3069 ms 93.0% + triton_mm_1230 0.3071 ms 92.9% +SingleProcess AUTOTUNE takes 3.9259 seconds +AUTOTUNE addmm(204288x12, 204288x256, 256x12) + triton_mm_1250 0.0881 ms 100.0% + triton_mm_1243 0.0881 ms 99.9% + triton_mm_1245 0.0890 ms 98.9% + triton_mm_1247 0.0893 ms 98.6% + triton_mm_1246 0.0896 ms 98.3% + triton_mm_1244 0.0896 ms 98.3% + triton_mm_1248 0.0911 ms 96.6% + triton_mm_1249 0.0923 ms 95.4% + triton_mm_1242 0.0924 ms 95.3% + triton_mm_1251 0.0925 ms 95.2% +SingleProcess AUTOTUNE takes 4.2935 seconds +AUTOTUNE addmm(51072x12, 51072x256, 256x12) + triton_mm_1256 0.0333 ms 100.0% + triton_mm_1262 0.0336 ms 99.3% + triton_mm_1258 0.0342 ms 97.5% + triton_mm_1263 0.0343 ms 97.2% + triton_mm_1255 0.0344 ms 97.0% + triton_mm_1257 0.0345 ms 96.8% + triton_mm_1261 0.0346 ms 96.3% + triton_mm_1259 0.0348 ms 95.7% + triton_mm_1254 0.0352 ms 94.6% + triton_mm_1265 0.0362 ms 92.1% +SingleProcess AUTOTUNE takes 4.0971 seconds +AUTOTUNE addmm(12768x12, 12768x256, 256x12) + triton_mm_1274 0.0129 ms 100.0% + triton_mm_1267 0.0132 ms 98.1% + triton_mm_1268 0.0134 ms 96.7% + triton_mm_1271 0.0134 ms 96.7% + triton_mm_1275 0.0135 ms 96.0% + triton_mm_1269 0.0137 ms 94.4% + triton_mm_1272 0.0138 ms 94.0% + triton_mm_1270 0.0139 ms 92.9% + bias_addmm 0.0149 ms 
86.9% + triton_mm_1266 0.0152 ms 85.2% +SingleProcess AUTOTUNE takes 3.8293 seconds +skipping cudagraphs due to ['non-cuda device in graph'] +[2023-12-12 17:13:02,525] [30/0_1] torch._inductor.utils: [WARNING] DeviceCopy in input program +skipping cudagraphs due to ['non-cuda device in graph'] +AUTOTUNE mm(32000x12544, 12544x1024) + mm 3.6433 ms 100.0% + triton_mm_1279 4.8504 ms 75.1% + triton_mm_1280 4.9807 ms 73.1% + triton_mm_1281 5.4958 ms 66.3% + triton_mm_1282 5.5499 ms 65.6% + triton_mm_1286 6.4978 ms 56.1% + triton_mm_1278 6.5740 ms 55.4% + triton_mm_1285 6.9910 ms 52.1% + triton_mm_1288 10.0610 ms 36.2% + triton_mm_1283 11.3868 ms 32.0% +SingleProcess AUTOTUNE takes 5.8258 seconds +AUTOTUNE mm(32000x1024, 1024x1024) + mm 0.3167 ms 100.0% + triton_mm_1292 0.3782 ms 83.7% + triton_mm_1291 0.3793 ms 83.5% + triton_mm_1293 0.4526 ms 70.0% + triton_mm_1294 0.4528 ms 69.9% + triton_mm_1290 0.4931 ms 64.2% + triton_mm_1298 0.5370 ms 59.0% + triton_mm_1297 0.5453 ms 58.1% + triton_mm_1300 0.7575 ms 41.8% + triton_mm_1299 0.9494 ms 33.4% +SingleProcess AUTOTUNE takes 5.4527 seconds +AUTOTUNE addmm(32000x81, 32000x1024, 1024x81) + triton_mm_1303 0.0913 ms 100.0% + triton_mm_1305 0.0941 ms 97.0% + triton_mm_1310 0.1024 ms 89.2% + triton_mm_1306 0.1032 ms 88.5% + triton_mm_1304 0.1063 ms 85.9% + triton_mm_1307 0.1226 ms 74.5% + triton_mm_1311 0.1240 ms 73.7% + triton_mm_1309 0.1309 ms 69.7% + triton_mm_1302 0.1377 ms 66.3% + triton_mm_1308 0.1441 ms 63.4% +SingleProcess AUTOTUNE takes 6.6387 seconds +AUTOTUNE addmm(32000x320, 32000x1024, 1024x320) + triton_mm_1316 0.1288 ms 100.0% + bias_addmm 0.1451 ms 88.8% + triton_mm_1318 0.1492 ms 86.3% + triton_mm_1315 0.1502 ms 85.7% + addmm 0.1558 ms 82.6% + triton_mm_1314 0.1653 ms 77.9% + triton_mm_1317 0.1708 ms 75.4% + triton_mm_1322 0.1740 ms 74.0% + triton_mm_1321 0.2398 ms 53.7% + triton_mm_1324 0.2614 ms 49.3% +SingleProcess AUTOTUNE takes 5.5531 seconds +skipping cudagraphs due to ['mutated inputs'] +skipping 
cudagraphs due to ['mutated inputs'] +skipping cudagraphs due to ['mutated inputs'] +skipping cudagraphs due to ['mutated inputs'] +skipping cudagraphs due to ['mutated inputs'] +skipping cudagraphs due to ['mutated inputs'] +skipping cudagraphs due to ['mutated inputs'] +skipping cudagraphs due to ['mutated inputs'] +[2023-12-12 17:14:54,209] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (8) +[2023-12-12 17:14:54,209] torch._dynamo.convert_frame: [WARNING] function: 'resume_in_detector_postprocess' (/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/modeling/postprocessing.py:45) +[2023-12-12 17:14:54,209] torch._dynamo.convert_frame: [WARNING] last reason: L['scale_x'] == 0.5337781484570475 # self.tensor[:, 0::2] *= scale_x # miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/structures/boxes.py:275 in scale +[2023-12-12 17:14:54,209] torch._dynamo.convert_frame: [WARNING] To log all recompilation reasons, use TORCH_LOGS="recompiles". +[2023-12-12 17:14:54,209] torch._dynamo.convert_frame: [WARNING] To diagnose recompilation issues, see https://pytorch.org/docs/master/compile/troubleshooting.html. 
+ running benchmark: 0%| | 0/30 [00:00 + async_compile.wait(globals()) + File "/home/cdhernandez/local/pytorch/torch/_inductor/codecache.py", line 2470, in wait + scope[key] = result.result() + File "/home/cdhernandez/local/pytorch/torch/_inductor/codecache.py", line 2313, in result + self.future.result() + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/concurrent/futures/_base.py", line 458, in result + return self.__get_result() + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/concurrent/futures/_base.py", line 403, in __get_result + raise self._exception +torch._dynamo.exc.BackendCompilerFailed: backend='inductor' raised: +CompilationError: at 14:40: xnumel = 196 + yoffset = tl.program_id(1).to(tl.int64) * YBLOCK + yindex = yoffset + tl.arange(0, YBLOCK)[None, :].to(tl.int64) + ymask = yindex < ynumel + xoffset = tl.program_id(0).to(tl.int64) * XBLOCK + xindex = xoffset + tl.arange(0, XBLOCK)[:, None].to(tl.int64) + xmask = xindex < xnumel + x2 = xindex + y3 = yindex + y0 = yindex % 1024 + y1 = (yindex // 1024) + tmp0 = tl.load(in_ptr0 + (x2 + (196*y3)), xmask, eviction_policy='evict_last').to(tl.float32) + ^ +ValueError('numel (262144) exceeds triton maximum tensor numel (131072)') + +Set TORCH_LOGS="+dynamo" and TORCHDYNAMO_VERBOSE=1 for more information + + +You can suppress this exception and fall back to eager by setting: + import torch._dynamo + torch._dynamo.config.suppress_errors = True + +Run failed with return code: 255 +Output: None +Error: None + loading model: 0it [00:00, ?it/s]detectron2_maskrcnn_r_101_fpn +WARNING:common:Model detectron2_maskrcnn_r_101_fpn does not support bfloat16, running with amp instead + loading model: 0it [00:06, ?it/s] +WARNING:common:Model detectron2_maskrcnn_r_101_fpn does not support bfloat16, running with amp instead +cuda eval detectron2_maskrcnn_r_101_fpn baseline-bs32 +WARNING:common:Model detectron2_maskrcnn_r_101_fpn does not support bfloat16, running with amp instead 
+skipping cudagraphs due to ['mutated inputs'] +skipping cudagraphs due to ['non-cuda device in graph'] +[2023-12-12 17:40:18,830] [30/0_1] torch._inductor.utils: [WARNING] DeviceCopy in input program +skipping cudagraphs due to ['non-cuda device in graph'] +skipping cudagraphs due to ['non-cuda device in graph'] +[2023-12-12 17:41:27,948] [30/1_1] torch._inductor.utils: [WARNING] DeviceCopy in input program +skipping cudagraphs due to ['non-cuda device in graph'] +AUTOTUNE convolution(957x256x14x14, 256x256x3x3) + convolution 1.2820 ms 100.0% + triton_convolution_1332 4.0781 ms 31.4% + triton_convolution_1327 4.1190 ms 31.1% + triton_convolution_1330 4.6714 ms 27.4% + triton_convolution_1329 5.5829 ms 23.0% + triton_convolution_1326 6.5548 ms 19.6% + triton_convolution_1331 8.8159 ms 14.5% + triton_convolution_1328 12.4215 ms 10.3% +SingleProcess AUTOTUNE takes 5.6736 seconds +AUTOTUNE convolution(957x256x28x28, 80x256x1x1) + triton_convolution_1354 0.8452 ms 100.0% + triton_convolution_1355 0.9540 ms 88.6% + triton_convolution_1359 0.9582 ms 88.2% + triton_convolution_1357 1.0138 ms 83.4% + convolution 1.0945 ms 77.2% + triton_convolution_1358 1.1035 ms 76.6% + triton_convolution_1360 1.1451 ms 73.8% + triton_convolution_1356 1.7224 ms 49.1% + conv1x1_via_mm 3.6812 ms 23.0% +SingleProcess AUTOTUNE takes 4.8280 seconds +skipping cudagraphs due to ['mutated inputs'] +skipping cudagraphs due to ['non-cuda device in graph'] +skipping cudagraphs due to ['mutated inputs'] +skipping cudagraphs due to ['non-cuda device in graph'] +skipping cudagraphs due to ['mutated inputs'] +skipping cudagraphs due to ['non-cuda device in graph'] +skipping cudagraphs due to ['mutated inputs'] +skipping cudagraphs due to ['mutated inputs'] +skipping cudagraphs due to ['mutated inputs'] +skipping cudagraphs due to ['mutated inputs'] +[2023-12-12 17:43:12,768] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (8) +[2023-12-12 17:43:12,768] 
torch._dynamo.convert_frame: [WARNING] function: 'resume_in_paste_masks_in_image' (/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/layers/mask_ops.py:123) +[2023-12-12 17:43:12,768] torch._dynamo.convert_frame: [WARNING] last reason: L['N'] == 36 # num_chunks <= N # miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/layers/mask_ops.py:125 in resume_in_paste_masks_in_image +[2023-12-12 17:43:12,768] torch._dynamo.convert_frame: [WARNING] To log all recompilation reasons, use TORCH_LOGS="recompiles". +[2023-12-12 17:43:12,768] torch._dynamo.convert_frame: [WARNING] To diagnose recompilation issues, see https://pytorch.org/docs/master/compile/troubleshooting.html. +skipping cudagraphs due to ['mutated inputs'] +[2023-12-12 17:43:22,978] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (8) +[2023-12-12 17:43:22,978] torch._dynamo.convert_frame: [WARNING] function: 'resume_in_detector_postprocess' (/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/modeling/postprocessing.py:45) +[2023-12-12 17:43:22,978] torch._dynamo.convert_frame: [WARNING] last reason: L['scale_x'] == 0.5337781484570475 # self.tensor[:, 0::2] *= scale_x # miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/structures/boxes.py:275 in scale +[2023-12-12 17:43:22,978] torch._dynamo.convert_frame: [WARNING] To log all recompilation reasons, use TORCH_LOGS="recompiles". +[2023-12-12 17:43:22,978] torch._dynamo.convert_frame: [WARNING] To diagnose recompilation issues, see https://pytorch.org/docs/master/compile/troubleshooting.html. 
+ running benchmark: 0%| | 0/30 [00:00 /home/cdhernandez/local/pytorch/torch/_inductor/decomposition.py(221)mm() +-> if config.coordinate_descent_tuning: +(Pdb) \ No newline at end of file diff --git a/torchao_benchmarks.sh b/torchao_benchmarks.sh new file mode 100644 index 0000000000..11fcbc1314 --- /dev/null +++ b/torchao_benchmarks.sh @@ -0,0 +1,25 @@ +echo "start dynamic" +python run_benchmark.py dynamo --bfloat16 --inductor --performance --inference --quantization int8dynamic --inductor-compile-mode max-autotune --tag int8dynamic +echo "start int8 weight only" +python run_benchmark.py dynamo --bfloat16 --inductor --performance --inference --quantization int8weightonly --inductor-compile-mode max-autotune --tag int8weightonly +echo "start int4 weight only" +python run_benchmark.py dynamo --bfloat16 --inductor --performance --inference --quantization int4weightonly --inductor-compile-mode max-autotune --tag int4weightonly +echo "start baseline" +python run_benchmark.py dynamo --bfloat16 --inductor --performance --inference --inductor-compile-mode max-autotune --tag baseline + +echo "start int8 weight only batchsize 1" +python run_benchmark.py dynamo --bfloat16 --inductor --performance --inference --quantization int8weightonly --inductor-compile-mode max-autotune --batch_size 1 --tag int8weightonly-bs1 +echo "start int4 weight only batchsize 1" +python run_benchmark.py dynamo --bfloat16 --inductor --performance --inference --quantization int4weightonly --inductor-compile-mode max-autotune --batch_size 1 --tag int4weightonly-bs1 +echo "start baseline batchsize 1" +python run_benchmark.py dynamo --bfloat16 --inductor --performance --inference --inductor-compile-mode max-autotune --batch_size 1 --tag baseline-bs1 + +echo "start dynamic batchsize 32" +python run_benchmark.py dynamo --bfloat16 --inductor --performance --inference --quantization int8dynamic --inductor-compile-mode max-autotune --batch_size 32 --tag int8dynamic-bs32 +echo "start baseline batchsize 32" 
+python run_benchmark.py dynamo --bfloat16 --inductor --performance --inference --inductor-compile-mode max-autotune --batch_size 32 --tag baseline-bs32 + +echo "start accuracy" +python run_benchmark.py dynamo --bfloat16 --inductor --inference --quantization int8dynamic --inductor-compile-mode max-autotune --batch_size 1 --tag int8dynamic-bs1-acc --accuracy +python run_benchmark.py dynamo --bfloat16 --inductor --inference --quantization int8weightonly --inductor-compile-mode max-autotune --batch_size 1 --tag int8weightonly-bs1-acc --accuracy +python run_benchmark.py dynamo --bfloat16 --inductor --inference --quantization int4weightonly --inductor-compile-mode max-autotune --batch_size 1 --tag int4weightonly-bs1-acc --accuracy diff --git a/userbenchmark/dynamo/dynamobench/common.py b/userbenchmark/dynamo/dynamobench/common.py index b11c3cbf62..fd2df37f45 100644 --- a/userbenchmark/dynamo/dynamobench/common.py +++ b/userbenchmark/dynamo/dynamobench/common.py @@ -73,6 +73,12 @@ from torch.utils import _pytree as pytree from torch.utils._pytree import tree_map, tree_map_only +import torchao +from torchao.quantization import ( + change_linear_weights_to_int8_dqtensors, + change_linear_weights_to_int8_woqtensors, + change_linear_weights_to_int4_woqtensors +) from tqdm.auto import tqdm, trange @@ -576,7 +582,7 @@ def maybe_mark_profile(*args, **kwargs): first_fields.append(kwargs["tag"]) headers = first_headers + ["speedup", "abs_latency"] row = first_fields + [float(speedup), median[1] * 1000] - msg = f"{speedup:.3f}x" + msg = f"{speedup*1000:.3f}ms" if args.baseline: headers.extend( [ @@ -2066,7 +2072,7 @@ def deepcopy_and_maybe_ddp(self, model): return model def check_accuracy( - self, name, model, example_inputs, optimize_ctx, experiment, tag + self, name, model, example_inputs, optimize_ctx, experiment, tag, res=None ): """ Checks accuracy. 
@@ -2238,6 +2244,10 @@ def record_status(accuracy_status, dynamo_start_stats): finally: del model_copy + sqnr = float("nan") + if res is not None and isinstance(res, torch.Tensor): + sqnr = 20 * torch.log10(torch.linalg.norm(res) / torch.linalg.norm(res - new_result)).item() + if name in self.skip_accuracy_check_as_eager_non_deterministic: return record_status("pass_due_to_skip", dynamo_start_stats=start_stats) @@ -2276,9 +2286,7 @@ def record_status(accuracy_status, dynamo_start_stats): accuracy_status = "pass_due_to_skip" else: accuracy_status = "fail_accuracy" - return record_status(accuracy_status, dynamo_start_stats=start_stats) - - return record_status(accuracy_status, dynamo_start_stats=start_stats) + return record_status(accuracy_status+f"-sqnr-{sqnr:.3f}", dynamo_start_stats=start_stats) def check_tolerance( self, name, model, example_inputs, optimize_ctx, base_device="cpu" @@ -2496,6 +2504,7 @@ def run_one_model( experiment, explain=False, tag=None, + res=None, ): mode = "train" if self.args.training else "eval" msg = f"{current_device:4} {mode:5} {current_name:34} " @@ -2507,7 +2516,7 @@ def run_one_model( if self.args.accuracy: status = self.check_accuracy( - name, model, example_inputs, optimize_ctx, experiment, tag + name, model, example_inputs, optimize_ctx, experiment, tag, res, ) print(status) if status == "fail_accuracy" and self.args.minify: @@ -2714,6 +2723,11 @@ def get_example_inputs(self): action="store_true", help="Create n processes based on the number of devices (distributed use case).", ) + parser.add_argument( + "--quantization", + choices=["int8dynamic", "int8weightonly", "int4weightonly"], + help="Apply quantization to the model before running it", + ) parser.add_argument( "--ddp", action="store_true", @@ -3535,6 +3549,7 @@ def run(runner, args, original_dir=None): extra_args=extra_args, ) else: + print(model_name) ( device, name, @@ -3547,6 +3562,24 @@ def run(runner, args, original_dir=None): batch_size=batch_size, extra_args=extra_args, ) 
+ res = None + if args.quantization: + if args.accuracy: + res=model(*example_inputs) # to later calculate SQNR + + torch._dynamo.config.automatic_dynamic_shapes = False + torch._dynamo.config.force_parameter_static_shapes = False + torch._dynamo.config.cache_size_limit = 1000 + assert "cuda" in device + if args.quantization=="int8dynamic": + torch._inductor.config.force_fuse_int_mm_with_mul = True + change_linear_weights_to_int8_dqtensors(model) + elif args.quantization=="int8weightonly": + torch._inductor.config.use_mixed_mm = True + change_linear_weights_to_int8_woqtensors(model) + elif args.quantization=="int4weightonly": + change_linear_weights_to_int4_woqtensors(model) + + except NotImplementedError as e: print(e) import traceback @@ -3614,6 +3647,7 @@ def detect_and_mark_batch(t): experiment, explain=args.explain, tag=args.tag, + res=res, ) if args.generate_aot_autograd_stats: stats_file = output_filename.split(".csv")[0] + "_stats.csv" @@ -3629,8 +3663,8 @@ def detect_and_mark_batch(t): ) else: metrics.purge_old_log_files() - if output_filename and os.path.exists(output_filename): - os.unlink(output_filename) + # if output_filename and os.path.exists(output_filename): + # os.unlink(output_filename) if original_dir: os.chdir(original_dir) model_names = list(runner.iter_model_names(args)) diff --git a/userbenchmark/dynamo/dynamobench/torchbench.py b/userbenchmark/dynamo/dynamobench/torchbench.py index 9919332e1d..3222762243 100755 --- a/userbenchmark/dynamo/dynamobench/torchbench.py +++ b/userbenchmark/dynamo/dynamobench/torchbench.py @@ -265,6 +265,17 @@ def setup_torchbench_cwd(): "tts_angular", "pyhpc_turbulent_kinetic_energy", "detectron2_fcos_r_50_fpn", + "detectron2_fasterrcnn_r_101_dc5", + "detectron2_fasterrcnn_r_50_c4", + "detectron2_fasterrcnn_r_101_c4", + "detectron2_fasterrcnn_r_101_fpn", + "detectron2_fasterrcnn_r_50_dc5", + "detectron2_fasterrcnn_r_50_fpn", + "detectron2_maskrcnn_r_101_c4", + "detectron2_maskrcnn_r_101_fpn", + 
"detectron2_maskrcnn_r_50_c4", + "detectron2_maskrcnn_r_50_fpn", + "demucs", } FORCE_FP16_FOR_BF16_MODELS = {"vision_maskrcnn"}