From 8640b85acfc861531dff37e5952a9ae763e0d55f Mon Sep 17 00:00:00 2001 From: HDCharles Date: Tue, 12 Dec 2023 18:05:07 -0800 Subject: [PATCH] [not for land] testing torchao coverage on torchbench/dynamo models Summary: testing locally accuracy and perf Test Plan: sh torchao_benchmarks.sh Reviewers: Subscribers: Tasks: Tags: ghstack-source-id: 98a007a42e7c024fd8fb87f2d92223ffc528e3c3 Pull Request resolved: https://github.com/pytorch/benchmark/pull/2075 --- log.log | 34549 ++++++++++++++++ torchao_benchmarks.sh | 25 + userbenchmark/dynamo/dynamobench/common.py | 50 +- .../dynamo/dynamobench/torchbench.py | 11 + 4 files changed, 34627 insertions(+), 8 deletions(-) create mode 100644 log.log create mode 100644 torchao_benchmarks.sh diff --git a/log.log b/log.log new file mode 100644 index 0000000000..39ac878584 --- /dev/null +++ b/log.log @@ -0,0 +1,34549 @@ +start dynamic + loading model: 0it [00:00, ?it/s] loading model: 0it [00:00, ?it/s] +torchrec_dlrm +/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/fbgemm_gpu/fbgemm_gpu_py.so: undefined symbol: _ZNK5torch8autograd4Node4nameEv +Traceback (most recent call last): + File "/home/cdhernandez/local/benchmark/run_benchmark.py", line 36, in + run() + File "/home/cdhernandez/local/benchmark/run_benchmark.py", line 30, in run + benchmark.run(bm_args) + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/run.py", line 24, in run + main(TorchBenchmarkRunner(), original_dir, args) + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 3133, in main + process_entry(0, runner, original_dir, args) + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 3090, in process_entry + return maybe_fresh_cache( + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 1714, in inner + return fn(*args, **kwargs) + File 
"/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 3559, in run + ) = runner.load_model( + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/torchbench.py", line 383, in load_model + module = importlib.import_module(c) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/importlib/__init__.py", line 126, in import_module + return _bootstrap._gcd_import(name[level:], package, level) + File "", line 1050, in _gcd_import + File "", line 1027, in _find_and_load + File "", line 1006, in _find_and_load_unlocked + File "", line 688, in _load_unlocked + File "", line 883, in exec_module + File "", line 241, in _call_with_frames_removed + File "/home/cdhernandez/local/benchmark/torchbenchmark/canary_models/torchrec_dlrm/__init__.py", line 7, in + from .data.dlrm_dataloader import get_dataloader + File "/home/cdhernandez/local/benchmark/torchbenchmark/canary_models/torchrec_dlrm/data/dlrm_dataloader.py", line 13, in + from torchrec.datasets.criteo import ( + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/torchrec/__init__.py", line 8, in + import torchrec.distributed # noqa + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/torchrec/distributed/__init__.py", line 36, in + from torchrec.distributed.model_parallel import DistributedModelParallel # noqa + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/torchrec/distributed/model_parallel.py", line 21, in + from torchrec.distributed.planner import EmbeddingShardingPlanner, Topology + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/torchrec/distributed/planner/__init__.py", line 22, in + from torchrec.distributed.planner.planners import EmbeddingShardingPlanner # noqa + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/torchrec/distributed/planner/planners.py", line 19, in + from 
torchrec.distributed.planner.constants import BATCH_SIZE, MAX_SIZE + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/torchrec/distributed/planner/constants.py", line 10, in + from torchrec.distributed.embedding_types import EmbeddingComputeKernel + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/torchrec/distributed/embedding_types.py", line 14, in + from fbgemm_gpu.split_table_batched_embeddings_ops_training import EmbeddingLocation + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/fbgemm_gpu/__init__.py", line 23, in + from . import _fbgemm_gpu_docs, sparse_ops # noqa: F401, E402 # noqa: F401, E402 + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/fbgemm_gpu/_fbgemm_gpu_docs.py", line 19, in + torch.ops.fbgemm.jagged_2d_to_dense, + File "/home/cdhernandez/local/pytorch/torch/_ops.py", line 822, in __getattr__ + raise AttributeError( +AttributeError: '_OpNamespace' 'fbgemm' object has no attribute 'jagged_2d_to_dense' +Run failed with return code: 1 +Output: None +Error: None + loading model: 0it [00:00, ?it/s] loading model: 0it [00:03, ?it/s] +BERT_pytorch +cuda eval BERT_pytorch int8dynamic + running benchmark: 0%| | 0/30 [00:00= MINSIZE && prev_inuse (old_top) && ((unsigned long) old_end & (pagesize - 1)) == 0) +Run failed with return code: -6 +Output: None +Error: None + loading model: 0it [00:00, ?it/s]WARNING:common:Model doctr_reco_predictor does not support bfloat16, running with amp instead + loading model: 0it [00:06, ?it/s] +WARNING:common:Model doctr_reco_predictor does not support bfloat16, running with amp instead +doctr_reco_predictor +cuda eval doctr_reco_predictor int8dynamic +WARNING:common:Model doctr_reco_predictor does not support bfloat16, running with amp instead + running benchmark: 0%| | 0/30 [00:00 
/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/diffusers/models/attention_processor.py(1236)__call__() +-> hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim) +(Pdb) TIMEOUT + loading model: 0it [00:00, ?it/s] loading model: 0it [00:05, ?it/s] +timm_efficientdet +cuda eval timm_efficientdet int8dynamic + running benchmark: 0%| | 0/30 [00:00 + run() + File "/home/cdhernandez/local/benchmark/run_benchmark.py", line 30, in run + benchmark.run(bm_args) + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/run.py", line 24, in run + main(TorchBenchmarkRunner(), original_dir, args) + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 3133, in main + process_entry(0, runner, original_dir, args) + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 3090, in process_entry + return maybe_fresh_cache( + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 1714, in inner + return fn(*args, **kwargs) + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 3559, in run + ) = runner.load_model( + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/torchbench.py", line 383, in load_model + module = importlib.import_module(c) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/importlib/__init__.py", line 126, in import_module + return _bootstrap._gcd_import(name[level:], package, level) + File "", line 1050, in _gcd_import + File "", line 1027, in _find_and_load + File "", line 1006, in _find_and_load_unlocked + File "", line 688, in _load_unlocked + File "", line 883, in exec_module + File "", line 241, in _call_with_frames_removed + File "/home/cdhernandez/local/benchmark/torchbenchmark/canary_models/torchrec_dlrm/__init__.py", line 7, in + from .data.dlrm_dataloader import get_dataloader + File 
"/home/cdhernandez/local/benchmark/torchbenchmark/canary_models/torchrec_dlrm/data/dlrm_dataloader.py", line 13, in + from torchrec.datasets.criteo import ( + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/torchrec/__init__.py", line 8, in + import torchrec.distributed # noqa + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/torchrec/distributed/__init__.py", line 36, in + from torchrec.distributed.model_parallel import DistributedModelParallel # noqa + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/torchrec/distributed/model_parallel.py", line 21, in + from torchrec.distributed.planner import EmbeddingShardingPlanner, Topology + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/torchrec/distributed/planner/__init__.py", line 22, in + from torchrec.distributed.planner.planners import EmbeddingShardingPlanner # noqa + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/torchrec/distributed/planner/planners.py", line 19, in + from torchrec.distributed.planner.constants import BATCH_SIZE, MAX_SIZE + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/torchrec/distributed/planner/constants.py", line 10, in + from torchrec.distributed.embedding_types import EmbeddingComputeKernel + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/torchrec/distributed/embedding_types.py", line 14, in + from fbgemm_gpu.split_table_batched_embeddings_ops_training import EmbeddingLocation + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/fbgemm_gpu/__init__.py", line 23, in + from . 
import _fbgemm_gpu_docs, sparse_ops # noqa: F401, E402 # noqa: F401, E402 + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/fbgemm_gpu/_fbgemm_gpu_docs.py", line 19, in + torch.ops.fbgemm.jagged_2d_to_dense, + File "/home/cdhernandez/local/pytorch/torch/_ops.py", line 822, in __getattr__ + raise AttributeError( +AttributeError: '_OpNamespace' 'fbgemm' object has no attribute 'jagged_2d_to_dense' +Run failed with return code: 1 +Output: None +Error: None + loading model: 0it [00:00, ?it/s] loading model: 0it [00:04, ?it/s] +BERT_pytorch +cuda eval BERT_pytorch int8weightonly + running benchmark: 0%| | 0/30 [00:00 + run() + File "/home/cdhernandez/local/benchmark/run_benchmark.py", line 30, in run + benchmark.run(bm_args) + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/run.py", line 24, in run + main(TorchBenchmarkRunner(), original_dir, args) + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 3133, in main + process_entry(0, runner, original_dir, args) + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 3090, in process_entry + return maybe_fresh_cache( + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 1714, in inner + return fn(*args, **kwargs) + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 3559, in run + ) = runner.load_model( + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/torchbench.py", line 383, in load_model + module = importlib.import_module(c) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/importlib/__init__.py", line 126, in import_module + return _bootstrap._gcd_import(name[level:], package, level) + File "", line 1050, in _gcd_import + File "", line 1027, in _find_and_load + File "", line 1006, in _find_and_load_unlocked + File "", line 688, in _load_unlocked + File "", line 883, in 
exec_module + File "", line 241, in _call_with_frames_removed + File "/home/cdhernandez/local/benchmark/torchbenchmark/canary_models/torchrec_dlrm/__init__.py", line 7, in + from .data.dlrm_dataloader import get_dataloader + File "/home/cdhernandez/local/benchmark/torchbenchmark/canary_models/torchrec_dlrm/data/dlrm_dataloader.py", line 13, in + from torchrec.datasets.criteo import ( + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/torchrec/__init__.py", line 8, in + import torchrec.distributed # noqa + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/torchrec/distributed/__init__.py", line 36, in + from torchrec.distributed.model_parallel import DistributedModelParallel # noqa + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/torchrec/distributed/model_parallel.py", line 21, in + from torchrec.distributed.planner import EmbeddingShardingPlanner, Topology + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/torchrec/distributed/planner/__init__.py", line 22, in + from torchrec.distributed.planner.planners import EmbeddingShardingPlanner # noqa + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/torchrec/distributed/planner/planners.py", line 19, in + from torchrec.distributed.planner.constants import BATCH_SIZE, MAX_SIZE + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/torchrec/distributed/planner/constants.py", line 10, in + from torchrec.distributed.embedding_types import EmbeddingComputeKernel + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/torchrec/distributed/embedding_types.py", line 14, in + from fbgemm_gpu.split_table_batched_embeddings_ops_training import EmbeddingLocation + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/fbgemm_gpu/__init__.py", line 23, in + from . 
import _fbgemm_gpu_docs, sparse_ops # noqa: F401, E402 # noqa: F401, E402 + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/fbgemm_gpu/_fbgemm_gpu_docs.py", line 19, in + torch.ops.fbgemm.jagged_2d_to_dense, + File "/home/cdhernandez/local/pytorch/torch/_ops.py", line 822, in __getattr__ + raise AttributeError( +AttributeError: '_OpNamespace' 'fbgemm' object has no attribute 'jagged_2d_to_dense' +Run failed with return code: 1 +Output: None +Error: None + loading model: 0it [00:00, ?it/s] loading model: 0it [00:04, ?it/s] +BERT_pytorch +cuda eval BERT_pytorch int4weightonly + running benchmark: 0%| | 0/30 [00:00 + run() + File "/home/cdhernandez/local/benchmark/run_benchmark.py", line 30, in run + benchmark.run(bm_args) + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/run.py", line 24, in run + main(TorchBenchmarkRunner(), original_dir, args) + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 3133, in main + process_entry(0, runner, original_dir, args) + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 3090, in process_entry + return maybe_fresh_cache( + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 1714, in inner + return fn(*args, **kwargs) + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 3559, in run + ) = runner.load_model( + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/torchbench.py", line 383, in load_model + module = importlib.import_module(c) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/importlib/__init__.py", line 126, in import_module + return _bootstrap._gcd_import(name[level:], package, level) + File "", line 1050, in _gcd_import + File "", line 1027, in _find_and_load + File "", line 1006, in _find_and_load_unlocked + File "", line 688, in _load_unlocked + File "", line 883, in 
exec_module + File "", line 241, in _call_with_frames_removed + File "/home/cdhernandez/local/benchmark/torchbenchmark/canary_models/torchrec_dlrm/__init__.py", line 7, in + from .data.dlrm_dataloader import get_dataloader + File "/home/cdhernandez/local/benchmark/torchbenchmark/canary_models/torchrec_dlrm/data/dlrm_dataloader.py", line 13, in + from torchrec.datasets.criteo import ( + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/torchrec/__init__.py", line 8, in + import torchrec.distributed # noqa + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/torchrec/distributed/__init__.py", line 36, in + from torchrec.distributed.model_parallel import DistributedModelParallel # noqa + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/torchrec/distributed/model_parallel.py", line 21, in + from torchrec.distributed.planner import EmbeddingShardingPlanner, Topology + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/torchrec/distributed/planner/__init__.py", line 22, in + from torchrec.distributed.planner.planners import EmbeddingShardingPlanner # noqa + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/torchrec/distributed/planner/planners.py", line 19, in + from torchrec.distributed.planner.constants import BATCH_SIZE, MAX_SIZE + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/torchrec/distributed/planner/constants.py", line 10, in + from torchrec.distributed.embedding_types import EmbeddingComputeKernel + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/torchrec/distributed/embedding_types.py", line 14, in + from fbgemm_gpu.split_table_batched_embeddings_ops_training import EmbeddingLocation + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/fbgemm_gpu/__init__.py", line 23, in + from . 
import _fbgemm_gpu_docs, sparse_ops # noqa: F401, E402 # noqa: F401, E402 + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/fbgemm_gpu/_fbgemm_gpu_docs.py", line 19, in + torch.ops.fbgemm.jagged_2d_to_dense, + File "/home/cdhernandez/local/pytorch/torch/_ops.py", line 822, in __getattr__ + raise AttributeError( +AttributeError: '_OpNamespace' 'fbgemm' object has no attribute 'jagged_2d_to_dense' +Run failed with return code: 1 +Output: None +Error: None + loading model: 0it [00:00, ?it/s] loading model: 0it [00:04, ?it/s] +BERT_pytorch +cuda eval BERT_pytorch baseline + running benchmark: 0%| | 0/30 [00:00 will be ignored +[rank0]:[2023-12-12 03:31:06,215] [1/0_1] torch._dynamo.backends.distributed: [WARNING] Some buckets were extended beyond their requested parameter capacities in order to ensure each subgraph has an output node, required for fx graph partitioning. This can be the case when a subgraph would have only contained nodes performing inplace mutation, and returning no logical outputs. This should not be a problem, unless it results in too few graph partitions for optimal DDP performance. 
+[rank0]:[2023-12-12 03:31:06,238] [1/0_1] torch._dynamo.backends.distributed: [WARNING] DDPOptimizer extended these buckets to ensure per-subgraph output nodes: +[rank0]:[2023-12-12 03:31:06,238] [1/0_1] torch._dynamo.backends.distributed: [WARNING] ┌─────────┬─────────────┬────────────────────────┐ +[rank0]:[2023-12-12 03:31:06,238] [1/0_1] torch._dynamo.backends.distributed: [WARNING] │ Index │ Extra Ops │ Extra Param Size (b) │ +[rank0]:[2023-12-12 03:31:06,238] [1/0_1] torch._dynamo.backends.distributed: [WARNING] ├─────────┼─────────────┼────────────────────────┤ +[rank0]:[2023-12-12 03:31:06,238] [1/0_1] torch._dynamo.backends.distributed: [WARNING] │ 0 │ 157 │ 44910720 │ +[rank0]:[2023-12-12 03:31:06,238] [1/0_1] torch._dynamo.backends.distributed: [WARNING] └─────────┴─────────────┴────────────────────────┘ +skipping cudagraphs due to ['mutated inputs'] +[rank0]:[2023-12-12 03:31:29,846] [5/0_1] torch._inductor.utils: [WARNING] DeviceCopy in input program +skipping cudagraphs due to ['non-cuda device in graph'] +[rank0]:[W CUDAGraph.cpp:145] Warning: Waiting for pending NCCL work to finish before starting graph capture. 
(function operator()) + running benchmark: 0%| | 0/30 [00:00 + run() + File "/home/cdhernandez/local/benchmark/run_benchmark.py", line 30, in run + benchmark.run(bm_args) + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/run.py", line 24, in run + main(TorchBenchmarkRunner(), original_dir, args) + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 3133, in main + process_entry(0, runner, original_dir, args) + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 3090, in process_entry + return maybe_fresh_cache( + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 1714, in inner + return fn(*args, **kwargs) + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 3559, in run + ) = runner.load_model( + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/torchbench.py", line 383, in load_model + module = importlib.import_module(c) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/importlib/__init__.py", line 126, in import_module + return _bootstrap._gcd_import(name[level:], package, level) + File "", line 1050, in _gcd_import + File "", line 1027, in _find_and_load + File "", line 1006, in _find_and_load_unlocked + File "", line 688, in _load_unlocked + File "", line 883, in exec_module + File "", line 241, in _call_with_frames_removed + File "/home/cdhernandez/local/benchmark/torchbenchmark/canary_models/torchrec_dlrm/__init__.py", line 7, in + from .data.dlrm_dataloader import get_dataloader + File "/home/cdhernandez/local/benchmark/torchbenchmark/canary_models/torchrec_dlrm/data/dlrm_dataloader.py", line 13, in + from torchrec.datasets.criteo import ( + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/torchrec/__init__.py", line 8, in + import torchrec.distributed # noqa + File 
"/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/torchrec/distributed/__init__.py", line 36, in + from torchrec.distributed.model_parallel import DistributedModelParallel # noqa + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/torchrec/distributed/model_parallel.py", line 21, in + from torchrec.distributed.planner import EmbeddingShardingPlanner, Topology + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/torchrec/distributed/planner/__init__.py", line 22, in + from torchrec.distributed.planner.planners import EmbeddingShardingPlanner # noqa + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/torchrec/distributed/planner/planners.py", line 19, in + from torchrec.distributed.planner.constants import BATCH_SIZE, MAX_SIZE + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/torchrec/distributed/planner/constants.py", line 10, in + from torchrec.distributed.embedding_types import EmbeddingComputeKernel + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/torchrec/distributed/embedding_types.py", line 14, in + from fbgemm_gpu.split_table_batched_embeddings_ops_training import EmbeddingLocation + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/fbgemm_gpu/__init__.py", line 23, in + from . 
import _fbgemm_gpu_docs, sparse_ops # noqa: F401, E402 # noqa: F401, E402 + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/fbgemm_gpu/_fbgemm_gpu_docs.py", line 19, in + torch.ops.fbgemm.jagged_2d_to_dense, + File "/home/cdhernandez/local/pytorch/torch/_ops.py", line 822, in __getattr__ + raise AttributeError( +AttributeError: '_OpNamespace' 'fbgemm' object has no attribute 'jagged_2d_to_dense' +Run failed with return code: 1 +Output: None +Error: None + loading model: 0it [00:00, ?it/s] loading model: 0it [00:03, ?it/s] +BERT_pytorch +cuda eval BERT_pytorch int8weightonly-bs1 + running benchmark: 0%| | 0/30 [00:00= MINSIZE && prev_inuse (old_top) && ((unsigned long) old_end & (pagesize - 1)) == 0) +Run failed with return code: -6 +Output: None +Error: None + loading model: 0it [00:00, ?it/s]WARNING:common:Model doctr_reco_predictor does not support bfloat16, running with amp instead + loading model: 0it [00:05, ?it/s] +WARNING:common:Model doctr_reco_predictor does not support bfloat16, running with amp instead +doctr_reco_predictor +cuda eval doctr_reco_predictor int8weightonly-bs1 +WARNING:common:Model doctr_reco_predictor does not support bfloat16, running with amp instead + running benchmark: 0%| | 0/30 [00:00 + run() + File "/home/cdhernandez/local/benchmark/run_benchmark.py", line 30, in run + benchmark.run(bm_args) + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/run.py", line 24, in run + main(TorchBenchmarkRunner(), original_dir, args) + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 3133, in main + process_entry(0, runner, original_dir, args) + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 3090, in process_entry + return maybe_fresh_cache( + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 1714, in inner + return fn(*args, **kwargs) + File 
"/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 3559, in run + ) = runner.load_model( + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/torchbench.py", line 383, in load_model + module = importlib.import_module(c) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/importlib/__init__.py", line 126, in import_module + return _bootstrap._gcd_import(name[level:], package, level) + File "", line 1050, in _gcd_import + File "", line 1027, in _find_and_load + File "", line 1006, in _find_and_load_unlocked + File "", line 688, in _load_unlocked + File "", line 883, in exec_module + File "", line 241, in _call_with_frames_removed + File "/home/cdhernandez/local/benchmark/torchbenchmark/canary_models/torchrec_dlrm/__init__.py", line 7, in + from .data.dlrm_dataloader import get_dataloader + File "/home/cdhernandez/local/benchmark/torchbenchmark/canary_models/torchrec_dlrm/data/dlrm_dataloader.py", line 13, in + from torchrec.datasets.criteo import ( + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/torchrec/__init__.py", line 8, in + import torchrec.distributed # noqa + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/torchrec/distributed/__init__.py", line 36, in + from torchrec.distributed.model_parallel import DistributedModelParallel # noqa + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/torchrec/distributed/model_parallel.py", line 21, in + from torchrec.distributed.planner import EmbeddingShardingPlanner, Topology + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/torchrec/distributed/planner/__init__.py", line 22, in + from torchrec.distributed.planner.planners import EmbeddingShardingPlanner # noqa + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/torchrec/distributed/planner/planners.py", line 19, in + from 
torchrec.distributed.planner.constants import BATCH_SIZE, MAX_SIZE + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/torchrec/distributed/planner/constants.py", line 10, in + from torchrec.distributed.embedding_types import EmbeddingComputeKernel + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/torchrec/distributed/embedding_types.py", line 14, in + from fbgemm_gpu.split_table_batched_embeddings_ops_training import EmbeddingLocation + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/fbgemm_gpu/__init__.py", line 23, in + from . import _fbgemm_gpu_docs, sparse_ops # noqa: F401, E402 # noqa: F401, E402 + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/fbgemm_gpu/_fbgemm_gpu_docs.py", line 19, in + torch.ops.fbgemm.jagged_2d_to_dense, + File "/home/cdhernandez/local/pytorch/torch/_ops.py", line 822, in __getattr__ + raise AttributeError( +AttributeError: '_OpNamespace' 'fbgemm' object has no attribute 'jagged_2d_to_dense' +Run failed with return code: 1 +Output: None +Error: None + loading model: 0it [00:00, ?it/s] loading model: 0it [00:04, ?it/s] +BERT_pytorch +cuda eval BERT_pytorch int4weightonly-bs1 + running benchmark: 0%| | 0/30 [00:00 + run() + File "/home/cdhernandez/local/benchmark/run_benchmark.py", line 30, in run + benchmark.run(bm_args) + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/run.py", line 24, in run + main(TorchBenchmarkRunner(), original_dir, args) + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 3133, in main + process_entry(0, runner, original_dir, args) + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 3090, in process_entry + return maybe_fresh_cache( + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 1714, in inner + return fn(*args, **kwargs) + File 
"/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 3559, in run + ) = runner.load_model( + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/torchbench.py", line 383, in load_model + module = importlib.import_module(c) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/importlib/__init__.py", line 126, in import_module + return _bootstrap._gcd_import(name[level:], package, level) + File "", line 1050, in _gcd_import + File "", line 1027, in _find_and_load + File "", line 1006, in _find_and_load_unlocked + File "", line 688, in _load_unlocked + File "", line 883, in exec_module + File "", line 241, in _call_with_frames_removed + File "/home/cdhernandez/local/benchmark/torchbenchmark/canary_models/torchrec_dlrm/__init__.py", line 7, in + from .data.dlrm_dataloader import get_dataloader + File "/home/cdhernandez/local/benchmark/torchbenchmark/canary_models/torchrec_dlrm/data/dlrm_dataloader.py", line 13, in + from torchrec.datasets.criteo import ( + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/torchrec/__init__.py", line 8, in + import torchrec.distributed # noqa + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/torchrec/distributed/__init__.py", line 36, in + from torchrec.distributed.model_parallel import DistributedModelParallel # noqa + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/torchrec/distributed/model_parallel.py", line 21, in + from torchrec.distributed.planner import EmbeddingShardingPlanner, Topology + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/torchrec/distributed/planner/__init__.py", line 22, in + from torchrec.distributed.planner.planners import EmbeddingShardingPlanner # noqa + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/torchrec/distributed/planner/planners.py", line 19, in + from 
torchrec.distributed.planner.constants import BATCH_SIZE, MAX_SIZE + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/torchrec/distributed/planner/constants.py", line 10, in + from torchrec.distributed.embedding_types import EmbeddingComputeKernel + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/torchrec/distributed/embedding_types.py", line 14, in + from fbgemm_gpu.split_table_batched_embeddings_ops_training import EmbeddingLocation + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/fbgemm_gpu/__init__.py", line 23, in + from . import _fbgemm_gpu_docs, sparse_ops # noqa: F401, E402 # noqa: F401, E402 + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/fbgemm_gpu/_fbgemm_gpu_docs.py", line 19, in + torch.ops.fbgemm.jagged_2d_to_dense, + File "/home/cdhernandez/local/pytorch/torch/_ops.py", line 822, in __getattr__ + raise AttributeError( +AttributeError: '_OpNamespace' 'fbgemm' object has no attribute 'jagged_2d_to_dense' +Run failed with return code: 1 +Output: None +Error: None + loading model: 0it [00:00, ?it/s] loading model: 0it [00:04, ?it/s] +BERT_pytorch +cuda eval BERT_pytorch baseline-bs1 +AUTOTUNE addmm(128x768, 128x768, 768x768) + bias_addmm 0.0127 ms 100.0% + triton_mm_5 0.0131 ms 97.5% + triton_mm_9 0.0131 ms 97.3% + triton_mm_6 0.0135 ms 94.4% + triton_mm_8 0.0147 ms 86.7% + addmm 0.0165 ms 77.0% + triton_mm_3 0.0170 ms 75.0% + triton_mm_4 0.0171 ms 74.3% + triton_mm_2 0.0205 ms 62.1% + triton_mm_1 0.0205 ms 62.0% +SingleProcess AUTOTUNE takes 5.5990 seconds +AUTOTUNE mm(128x768, 768x768) + mm 0.0119 ms 100.0% + triton_mm_65 0.0125 ms 94.6% + triton_mm_66 0.0131 ms 90.7% + triton_mm_69 0.0133 ms 89.0% + triton_mm_68 0.0136 ms 87.5% + triton_mm_64 0.0158 ms 74.9% + triton_mm_63 0.0162 ms 73.4% + triton_mm_62 0.0192 ms 61.8% + triton_mm_61 0.0197 ms 60.1% + triton_mm_60 0.0279 ms 42.5% +SingleProcess AUTOTUNE takes 
5.1503 seconds +AUTOTUNE mm(128x768, 768x3072) + mm 0.0145 ms 100.0% + triton_mm_80 0.0150 ms 96.8% + triton_mm_76 0.0168 ms 86.6% + triton_mm_78 0.0170 ms 85.5% + triton_mm_75 0.0173 ms 84.1% + triton_mm_77 0.0178 ms 81.7% + triton_mm_81 0.0187 ms 77.6% + triton_mm_74 0.0202 ms 72.1% + triton_mm_73 0.0203 ms 71.7% + triton_mm_72 0.0300 ms 48.5% +SingleProcess AUTOTUNE takes 4.9015 seconds +AUTOTUNE mm(128x3072, 3072x768) + mm 0.0179 ms 100.0% + triton_mm_90 0.0298 ms 59.9% + triton_mm_89 0.0300 ms 59.4% + triton_mm_93 0.0310 ms 57.6% + triton_mm_92 0.0343 ms 52.1% + triton_mm_88 0.0411 ms 43.5% + triton_mm_87 0.0415 ms 43.0% + triton_mm_86 0.0557 ms 32.1% + triton_mm_85 0.0557 ms 32.0% + triton_mm_84 0.0747 ms 23.9% +SingleProcess AUTOTUNE takes 4.8119 seconds + running benchmark: 0%| | 0/30 [00:00= MINSIZE && prev_inuse (old_top) && ((unsigned long) old_end & (pagesize - 1)) == 0) +Run failed with return code: -6 +Output: None +Error: None + loading model: 0it [00:00, ?it/s]WARNING:common:Model doctr_reco_predictor does not support bfloat16, running with amp instead + loading model: 0it [00:05, ?it/s] +WARNING:common:Model doctr_reco_predictor does not support bfloat16, running with amp instead +doctr_reco_predictor +cuda eval doctr_reco_predictor baseline-bs1 +WARNING:common:Model doctr_reco_predictor does not support bfloat16, running with amp instead + running benchmark: 0%| | 0/30 [00:00 will be ignored +[rank0]:[2023-12-12 09:46:53,460] [1/0_1] torch._dynamo.backends.distributed: [WARNING] Some buckets were extended beyond their requested parameter capacities in order to ensure each subgraph has an output node, required for fx graph partitioning. This can be the case when a subgraph would have only contained nodes performing inplace mutation, and returning no logical outputs. This should not be a problem, unless it results in too few graph partitions for optimal DDP performance. 
+[rank0]:[2023-12-12 09:46:53,486] [1/0_1] torch._dynamo.backends.distributed: [WARNING] DDPOptimizer extended these buckets to ensure per-subgraph output nodes: +[rank0]:[2023-12-12 09:46:53,486] [1/0_1] torch._dynamo.backends.distributed: [WARNING] ┌─────────┬─────────────┬────────────────────────┐ +[rank0]:[2023-12-12 09:46:53,486] [1/0_1] torch._dynamo.backends.distributed: [WARNING] │ Index │ Extra Ops │ Extra Param Size (b) │ +[rank0]:[2023-12-12 09:46:53,486] [1/0_1] torch._dynamo.backends.distributed: [WARNING] ├─────────┼─────────────┼────────────────────────┤ +[rank0]:[2023-12-12 09:46:53,486] [1/0_1] torch._dynamo.backends.distributed: [WARNING] │ 0 │ 157 │ 44910720 │ +[rank0]:[2023-12-12 09:46:53,486] [1/0_1] torch._dynamo.backends.distributed: [WARNING] └─────────┴─────────────┴────────────────────────┘ +AUTOTUNE addmm(1x128, 1x2048, 2048x128) + bias_addmm 0.0112 ms 100.0% + addmm 0.0112 ms 100.0% + triton_mm_540 0.0187 ms 60.1% + triton_mm_541 0.0198 ms 56.8% + triton_mm_543 0.0201 ms 55.9% + triton_mm_544 0.0210 ms 53.5% + triton_mm_539 0.0217 ms 51.7% + triton_mm_538 0.0241 ms 46.7% + triton_mm_537 0.0308 ms 36.4% + triton_mm_536 0.0331 ms 34.0% +SingleProcess AUTOTUNE takes 4.3309 seconds +skipping cudagraphs due to ['mutated inputs'] +[rank0]:[2023-12-12 09:47:26,228] [5/0_1] torch._inductor.utils: [WARNING] DeviceCopy in input program +skipping cudagraphs due to ['non-cuda device in graph'] +AUTOTUNE mm(1x2048, 2048x128) + mm 0.0083 ms 100.0% + triton_mm_1087 0.0177 ms 46.8% + triton_mm_1088 0.0188 ms 44.0% + triton_mm_1090 0.0196 ms 42.2% + triton_mm_1091 0.0205 ms 40.3% + triton_mm_1086 0.0207 ms 40.0% + triton_mm_1085 0.0236 ms 35.0% + triton_mm_1084 0.0308 ms 26.9% + triton_mm_1083 0.0326 ms 25.4% + triton_mm_1082 0.0543 ms 15.3% +SingleProcess AUTOTUNE takes 4.4818 seconds +AUTOTUNE bmm(1x1x128, 1x128x1) + triton_bmm_1096 0.0061 ms 100.0% + triton_bmm_1098 0.0061 ms 100.0% + bmm 0.0065 ms 93.1% + triton_bmm_1097 0.0066 ms 91.8% + 
triton_bmm_1099 0.0066 ms 91.8% + triton_bmm_1095 0.0070 ms 86.4% + triton_bmm_1094 0.0074 ms 82.3% + triton_bmm_1100 0.0081 ms 75.4% + triton_bmm_1101 0.0086 ms 70.6% +SingleProcess AUTOTUNE takes 2.5842 seconds +AUTOTUNE bmm(1x1x128, 1x128x32000) + triton_bmm_1102 0.0140 ms 100.0% + triton_bmm_1104 0.0145 ms 96.6% + triton_bmm_1103 0.0150 ms 93.2% + triton_bmm_1106 0.0152 ms 91.8% + triton_bmm_1108 0.0154 ms 90.6% + triton_bmm_1105 0.0157 ms 89.2% + triton_bmm_1111 0.0158 ms 88.6% + triton_bmm_1112 0.0158 ms 88.6% + triton_bmm_1113 0.0162 ms 86.4% + triton_bmm_1109 0.0181 ms 77.1% +SingleProcess AUTOTUNE takes 3.8806 seconds +[rank0]:[W CUDAGraph.cpp:145] Warning: Waiting for pending NCCL work to finish before starting graph capture. (function operator()) + running benchmark: 0%| | 0/30 [00:00 + run() + File "/home/cdhernandez/local/benchmark/run_benchmark.py", line 30, in run + benchmark.run(bm_args) + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/run.py", line 24, in run + main(TorchBenchmarkRunner(), original_dir, args) + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 3133, in main + process_entry(0, runner, original_dir, args) + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 3090, in process_entry + return maybe_fresh_cache( + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 1714, in inner + return fn(*args, **kwargs) + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 3559, in run + ) = runner.load_model( + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/torchbench.py", line 383, in load_model + module = importlib.import_module(c) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/importlib/__init__.py", line 126, in import_module + return _bootstrap._gcd_import(name[level:], package, level) + File "", line 1050, in _gcd_import + File 
"", line 1027, in _find_and_load + File "", line 1006, in _find_and_load_unlocked + File "", line 688, in _load_unlocked + File "", line 883, in exec_module + File "", line 241, in _call_with_frames_removed + File "/home/cdhernandez/local/benchmark/torchbenchmark/canary_models/torchrec_dlrm/__init__.py", line 7, in + from .data.dlrm_dataloader import get_dataloader + File "/home/cdhernandez/local/benchmark/torchbenchmark/canary_models/torchrec_dlrm/data/dlrm_dataloader.py", line 13, in + from torchrec.datasets.criteo import ( + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/torchrec/__init__.py", line 8, in + import torchrec.distributed # noqa + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/torchrec/distributed/__init__.py", line 36, in + from torchrec.distributed.model_parallel import DistributedModelParallel # noqa + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/torchrec/distributed/model_parallel.py", line 21, in + from torchrec.distributed.planner import EmbeddingShardingPlanner, Topology + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/torchrec/distributed/planner/__init__.py", line 22, in + from torchrec.distributed.planner.planners import EmbeddingShardingPlanner # noqa + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/torchrec/distributed/planner/planners.py", line 19, in + from torchrec.distributed.planner.constants import BATCH_SIZE, MAX_SIZE + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/torchrec/distributed/planner/constants.py", line 10, in + from torchrec.distributed.embedding_types import EmbeddingComputeKernel + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/torchrec/distributed/embedding_types.py", line 14, in + from fbgemm_gpu.split_table_batched_embeddings_ops_training import EmbeddingLocation + 
File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/fbgemm_gpu/__init__.py", line 23, in + from . import _fbgemm_gpu_docs, sparse_ops # noqa: F401, E402 # noqa: F401, E402 + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/fbgemm_gpu/_fbgemm_gpu_docs.py", line 19, in + torch.ops.fbgemm.jagged_2d_to_dense, + File "/home/cdhernandez/local/pytorch/torch/_ops.py", line 822, in __getattr__ + raise AttributeError( +AttributeError: '_OpNamespace' 'fbgemm' object has no attribute 'jagged_2d_to_dense' +Run failed with return code: 1 +Output: None +Error: None + loading model: 0it [00:00, ?it/s] loading model: 0it [00:04, ?it/s] +BERT_pytorch +cuda eval BERT_pytorch int8dynamic-bs32 +AUTOTUNE bmm(384x128x64, 384x64x128) + triton_bmm_30 0.0256 ms 100.0% + triton_bmm_23 0.0266 ms 96.0% + triton_bmm_24 0.0267 ms 95.9% + triton_bmm_25 0.0274 ms 93.3% + triton_bmm_26 0.0275 ms 93.1% + triton_bmm_32 0.0281 ms 91.1% + triton_bmm_22 0.0282 ms 90.6% + bmm 0.0292 ms 87.5% + triton_bmm_29 0.0296 ms 86.4% + triton_bmm_31 0.0316 ms 80.9% +SingleProcess AUTOTUNE takes 1.8196 seconds +AUTOTUNE bmm(384x128x128, 384x128x64) + triton_bmm_47 0.0282 ms 100.0% + triton_bmm_46 0.0296 ms 95.3% + triton_bmm_53 0.0299 ms 94.2% + triton_bmm_45 0.0300 ms 93.9% + triton_bmm_49 0.0301 ms 93.6% + triton_bmm_52 0.0301 ms 93.6% + triton_bmm_48 0.0313 ms 90.1% + triton_bmm_51 0.0322 ms 87.5% + triton_bmm_55 0.0325 ms 86.7% + triton_bmm_50 0.0334 ms 84.4% +SingleProcess AUTOTUNE takes 1.8928 seconds + running benchmark: 0%| | 0/30 [00:00 + async_compile.wait(globals()) + File "/home/cdhernandez/local/pytorch/torch/_inductor/codecache.py", line 2470, in wait + scope[key] = result.result() + File "/home/cdhernandez/local/pytorch/torch/_inductor/codecache.py", line 2313, in result + self.future.result() + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/concurrent/futures/_base.py", line 458, in result + return 
self.__get_result() + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/concurrent/futures/_base.py", line 403, in __get_result + raise self._exception +torch._dynamo.exc.BackendCompilerFailed: backend='inductor' raised: +CompilationError: at 14:40: xnumel = 196 + yoffset = tl.program_id(1).to(tl.int64) * YBLOCK + yindex = yoffset + tl.arange(0, YBLOCK)[None, :].to(tl.int64) + ymask = yindex < ynumel + xoffset = tl.program_id(0).to(tl.int64) * XBLOCK + xindex = xoffset + tl.arange(0, XBLOCK)[:, None].to(tl.int64) + xmask = xindex < xnumel + x2 = xindex + y3 = yindex + y0 = yindex % 1024 + y1 = (yindex // 1024) + tmp0 = tl.load(in_ptr0 + (x2 + (196*y3)), xmask, eviction_policy='evict_last').to(tl.float32) + ^ +ValueError('numel (262144) exceeds triton maximum tensor numel (131072)') + +Set TORCH_LOGS="+dynamo" and TORCHDYNAMO_VERBOSE=1 for more information + + +You can suppress this exception and fall back to eager by setting: + import torch._dynamo + torch._dynamo.config.suppress_errors = True + +Run failed with return code: 255 +Output: None +Error: None + loading model: 0it [00:00, ?it/s]detectron2_fasterrcnn_r_101_dc5 + loading model: 0it [00:08, ?it/s] +WARNING:root:detectron2_fasterrcnn_r_101_dc5 failed to load +Original Error: "roi_align_forward_kernel" not implemented for 'BFloat16' +Eager model failed to run +Traceback (most recent call last): + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 1931, in validate_model + self.model_iter_fn(model, example_inputs) + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/torchbench.py", line 540, in forward_pass + return mod(*inputs) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1520, in _call_impl + return forward_call(*args, **kwargs) + File 
"/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/modeling/meta_arch/rcnn.py", line 150, in forward + return self.inference(batched_inputs) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/modeling/meta_arch/rcnn.py", line 213, in inference + results, _ = self.roi_heads(images, features, proposals, None) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1520, in _call_impl + return forward_call(*args, **kwargs) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/modeling/roi_heads/roi_heads.py", line 747, in forward + pred_instances = self._forward_box(features, proposals) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/modeling/roi_heads/roi_heads.py", line 798, in _forward_box + box_features = self.box_pooler(features, [x.proposal_boxes for x in proposals]) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1520, in _call_impl + return forward_call(*args, **kwargs) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/modeling/poolers.py", line 246, in forward + return self.level_poolers[0](x[0], pooler_fmt_boxes) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1520, in _call_impl + return forward_call(*args, **kwargs) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/layers/roi_align.py", line 58, in 
forward + return roi_align( + File "/home/cdhernandez/local/vision/torchvision/ops/roi_align.py", line 238, in roi_align + return torch.ops.torchvision.roi_align( + File "/home/cdhernandez/local/pytorch/torch/_ops.py", line 755, in __call__ + return self._op(*args, **(kwargs or {})) +RuntimeError: "roi_align_forward_kernel" not implemented for 'BFloat16' + +The above exception was the direct cause of the following exception: + +Traceback (most recent call last): + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 3559, in run + ) = runner.load_model( + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/torchbench.py", line 476, in load_model + self.validate_model(model, example_inputs) + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 1934, in validate_model + raise NotImplementedError("Eager model failed to run") from e +NotImplementedError: Eager model failed to run + + loading model: 0it [00:00, ?it/s]detectron2_fasterrcnn_r_101_fpn +WARNING:common:Model detectron2_fasterrcnn_r_101_fpn does not support bfloat16, running with amp instead + loading model: 0it [00:07, ?it/s] +WARNING:common:Model detectron2_fasterrcnn_r_101_fpn does not support bfloat16, running with amp instead +cuda eval detectron2_fasterrcnn_r_101_fpn int8dynamic-bs32 +WARNING:common:Model detectron2_fasterrcnn_r_101_fpn does not support bfloat16, running with amp instead +skipping cudagraphs due to ['mutated inputs'] +AUTOTUNE convolution(32x3x1216x1344, 64x3x7x7) + convolution 3.7106 ms 100.0% + triton_convolution_3 22.1062 ms 16.8% + triton_convolution_4 24.2973 ms 15.3% + triton_convolution_5 26.9378 ms 13.8% + triton_convolution_0 30.0738 ms 12.3% + triton_convolution_2 32.5176 ms 11.4% + triton_convolution_1 81.4506 ms 4.6% +SingleProcess AUTOTUNE takes 5.0160 seconds +AUTOTUNE mm(3268608x64, 64x64) + triton_mm_14 0.5186 ms 100.0% + triton_mm_8 0.5193 ms 99.9% + triton_mm_7 0.5283 ms 
98.2% + triton_mm_10 0.5368 ms 96.6% + triton_mm_6 0.5460 ms 95.0% + triton_mm_13 0.5492 ms 94.4% + triton_mm_9 0.5513 ms 94.1% + mm 0.5668 ms 91.5% + triton_mm_15 0.6987 ms 74.2% + triton_mm_16 0.7410 ms 70.0% +SingleProcess AUTOTUNE takes 4.1927 seconds +AUTOTUNE convolution(32x64x304x336, 64x64x3x3) + convolution 1.4962 ms 100.0% + triton_convolution_18 7.1458 ms 20.9% + triton_convolution_23 8.0892 ms 18.5% + triton_convolution_24 9.7211 ms 15.4% + triton_convolution_19 11.9942 ms 12.5% + triton_convolution_21 12.1926 ms 12.3% + triton_convolution_22 12.5636 ms 11.9% + triton_convolution_20 28.5195 ms 5.2% +SingleProcess AUTOTUNE takes 4.7321 seconds +AUTOTUNE mm(3268608x64, 64x256) + triton_mm_27 1.5259 ms 100.0% + triton_mm_26 1.5455 ms 98.7% + triton_mm_28 1.7398 ms 87.7% + mm 1.7660 ms 86.4% + triton_mm_29 1.7692 ms 86.2% + triton_mm_33 1.8433 ms 82.8% + triton_mm_25 1.8774 ms 81.3% + triton_mm_32 1.9772 ms 77.2% + triton_mm_35 2.4140 ms 63.2% + triton_mm_34 2.8250 ms 54.0% +SingleProcess AUTOTUNE takes 4.9396 seconds +AUTOTUNE mm(3268608x256, 256x64) + triton_mm_51 1.3316 ms 100.0% + triton_mm_53 1.3624 ms 97.7% + triton_mm_56 1.3661 ms 97.5% + mm 1.4040 ms 94.8% + triton_mm_57 1.4285 ms 93.2% + triton_mm_50 1.4361 ms 92.7% + triton_mm_49 1.4596 ms 91.2% + triton_mm_52 1.4744 ms 90.3% + triton_mm_54 2.1015 ms 63.4% + triton_mm_55 2.2606 ms 58.9% +SingleProcess AUTOTUNE takes 4.7929 seconds +AUTOTUNE convolution(32x256x304x336, 128x256x1x1) + convolution 0.4567 ms 100.0% + triton_convolution_114 1.0757 ms 42.5% + triton_convolution_111 1.2215 ms 37.4% + triton_convolution_117 1.3653 ms 33.5% + triton_convolution_116 1.5024 ms 30.4% + triton_convolution_115 1.6693 ms 27.4% + triton_convolution_112 3.1440 ms 14.5% + triton_convolution_113 6.8288 ms 6.7% +SingleProcess AUTOTUNE takes 4.5671 seconds +AUTOTUNE convolution(32x128x152x168, 128x128x3x3) + convolution 1.1805 ms 100.0% + triton_convolution_121 7.0887 ms 16.7% + triton_convolution_118 8.0229 ms 14.7% 
+ triton_convolution_123 8.3772 ms 14.1% + triton_convolution_124 10.6232 ms 11.1% + triton_convolution_122 12.4481 ms 9.5% + triton_convolution_119 13.7375 ms 8.6% + triton_convolution_120 28.6058 ms 4.1% +SingleProcess AUTOTUNE takes 4.7865 seconds +AUTOTUNE mm(817152x128, 128x512) + triton_mm_127 0.9996 ms 100.0% + triton_mm_126 1.0023 ms 99.7% + triton_mm_132 1.0768 ms 92.8% + triton_mm_125 1.1161 ms 89.6% + mm 1.1354 ms 88.0% + triton_mm_128 1.1937 ms 83.7% + triton_mm_129 1.2079 ms 82.8% + triton_mm_133 1.3932 ms 71.8% + triton_mm_135 1.6393 ms 61.0% + triton_mm_134 2.2798 ms 43.8% +SingleProcess AUTOTUNE takes 4.8697 seconds +AUTOTUNE convolution(32x256x304x336, 512x256x1x1) + convolution 1.5260 ms 100.0% + triton_convolution_140 4.2531 ms 35.9% + triton_convolution_142 4.5226 ms 33.7% + triton_convolution_143 5.3815 ms 28.4% + triton_convolution_141 6.6562 ms 22.9% + triton_convolution_137 8.2852 ms 18.4% + triton_convolution_138 12.5198 ms 12.2% + triton_convolution_139 27.1271 ms 5.6% +SingleProcess AUTOTUNE takes 5.0886 seconds +AUTOTUNE mm(817152x512, 512x128) + mm 0.7224 ms 100.0% + triton_mm_146 0.8024 ms 90.0% + triton_mm_145 0.8341 ms 86.6% + triton_mm_148 0.8952 ms 80.7% + triton_mm_147 0.9011 ms 80.2% + triton_mm_151 0.9721 ms 74.3% + triton_mm_152 1.0261 ms 70.4% + triton_mm_144 1.0666 ms 67.7% + triton_mm_154 1.5977 ms 45.2% + triton_mm_149 1.7777 ms 40.6% +SingleProcess AUTOTUNE takes 5.0220 seconds +AUTOTUNE convolution(32x512x152x168, 256x512x1x1) + convolution 0.3106 ms 100.0% + triton_convolution_240 1.0496 ms 29.6% + triton_convolution_242 1.0762 ms 28.9% + triton_convolution_243 1.2981 ms 23.9% + triton_convolution_241 1.7032 ms 18.2% + triton_convolution_237 2.1030 ms 14.8% + triton_convolution_238 3.1476 ms 9.9% + triton_convolution_239 6.7274 ms 4.6% +SingleProcess AUTOTUNE takes 4.7450 seconds +AUTOTUNE convolution(32x256x76x84, 256x256x3x3) + convolution 1.0683 ms 100.0% + triton_convolution_249 6.4305 ms 16.6% + 
triton_convolution_247 7.2643 ms 14.7% + triton_convolution_244 7.9726 ms 13.4% + triton_convolution_250 11.5396 ms 9.3% + triton_convolution_248 16.4705 ms 6.5% + triton_convolution_245 19.3214 ms 5.5% + triton_convolution_246 29.2912 ms 3.6% +SingleProcess AUTOTUNE takes 5.4721 seconds +AUTOTUNE mm(204288x256, 256x1024) + mm 0.6795 ms 100.0% + triton_mm_253 0.7678 ms 88.5% + triton_mm_252 0.7709 ms 88.1% + triton_mm_258 0.7911 ms 85.9% + triton_mm_251 0.9138 ms 74.4% + triton_mm_254 0.9144 ms 74.3% + triton_mm_255 0.9386 ms 72.4% + triton_mm_259 1.0814 ms 62.8% + triton_mm_261 1.3090 ms 51.9% + triton_mm_260 1.8635 ms 36.5% +SingleProcess AUTOTUNE takes 4.9279 seconds +AUTOTUNE convolution(32x512x152x168, 1024x512x1x1) + convolution 1.2123 ms 100.0% + triton_convolution_266 4.1210 ms 29.4% + triton_convolution_268 4.2024 ms 28.8% + triton_convolution_269 5.0803 ms 23.9% + triton_convolution_267 6.5498 ms 18.5% + triton_convolution_263 8.1485 ms 14.9% + triton_convolution_264 12.4510 ms 9.7% + triton_convolution_265 26.7531 ms 4.5% +SingleProcess AUTOTUNE takes 5.0502 seconds +AUTOTUNE mm(204288x1024, 1024x256) + mm 0.5419 ms 100.0% + triton_mm_271 0.6533 ms 82.9% + triton_mm_272 0.6558 ms 82.6% + triton_mm_273 0.7579 ms 71.5% + triton_mm_274 0.7680 ms 70.6% + triton_mm_270 0.8710 ms 62.2% + triton_mm_278 0.8870 ms 61.1% + triton_mm_277 0.9553 ms 56.7% + triton_mm_280 1.3468 ms 40.2% + triton_mm_279 1.5510 ms 34.9% +SingleProcess AUTOTUNE takes 5.6453 seconds +AUTOTUNE convolution(32x1024x76x84, 512x1024x1x1) + convolution 0.2594 ms 100.0% + triton_convolution_955 1.0005 ms 25.9% + triton_convolution_957 1.1321 ms 22.9% + triton_convolution_958 1.2459 ms 20.8% + triton_convolution_956 1.9279 ms 13.5% + triton_convolution_952 2.2604 ms 11.5% + triton_convolution_953 3.1723 ms 8.2% + triton_convolution_954 6.5568 ms 4.0% +SingleProcess AUTOTUNE takes 5.2660 seconds +AUTOTUNE convolution(32x512x38x42, 512x512x3x3) + convolution 1.0310 ms 100.0% + 
triton_convolution_964 6.9642 ms 14.8% + triton_convolution_959 9.8914 ms 10.4% + triton_convolution_962 10.1237 ms 10.2% + triton_convolution_965 17.1603 ms 6.0% + triton_convolution_963 19.1972 ms 5.4% + triton_convolution_960 21.0584 ms 4.9% + triton_convolution_961 28.6043 ms 3.6% +SingleProcess AUTOTUNE takes 5.4783 seconds +AUTOTUNE mm(51072x512, 512x2048) + mm 0.5481 ms 100.0% + triton_mm_968 0.6581 ms 83.3% + triton_mm_967 0.6594 ms 83.1% + triton_mm_973 0.6983 ms 78.5% + triton_mm_970 0.7820 ms 70.1% + triton_mm_969 0.7847 ms 69.9% + triton_mm_966 0.8211 ms 66.8% + triton_mm_974 0.9289 ms 59.0% + triton_mm_976 1.2285 ms 44.6% + triton_mm_972 1.6656 ms 32.9% +SingleProcess AUTOTUNE takes 5.7071 seconds +AUTOTUNE convolution(32x1024x76x84, 2048x1024x1x1) + convolution 1.0465 ms 100.0% + triton_convolution_981 3.9368 ms 26.6% + triton_convolution_983 4.3779 ms 23.9% + triton_convolution_984 4.9079 ms 21.3% + triton_convolution_982 7.6491 ms 13.7% + triton_convolution_978 8.3042 ms 12.6% + triton_convolution_979 12.2804 ms 8.5% + triton_convolution_980 26.2045 ms 4.0% +SingleProcess AUTOTUNE takes 5.1454 seconds +AUTOTUNE mm(51072x2048, 2048x512) + mm 0.4900 ms 100.0% + triton_mm_987 0.6065 ms 80.8% + triton_mm_986 0.6077 ms 80.6% + triton_mm_988 0.7009 ms 69.9% + triton_mm_989 0.7034 ms 69.7% + triton_mm_985 0.8206 ms 59.7% + triton_mm_993 0.8374 ms 58.5% + triton_mm_992 0.8790 ms 55.7% + triton_mm_995 1.2860 ms 38.1% + triton_mm_991 1.5045 ms 32.6% +SingleProcess AUTOTUNE takes 5.4583 seconds +AUTOTUNE addmm(51072x256, 51072x2048, 2048x256) + bias_addmm 0.3029 ms 100.0% + addmm 0.3089 ms 98.1% + triton_mm_1049 0.3235 ms 93.6% + triton_mm_1048 0.3275 ms 92.5% + triton_mm_1050 0.3550 ms 85.3% + triton_mm_1051 0.3622 ms 83.6% + triton_mm_1055 0.4242 ms 71.4% + triton_mm_1047 0.4364 ms 69.4% + triton_mm_1054 0.5085 ms 59.6% + triton_mm_1057 0.7001 ms 43.3% +SingleProcess AUTOTUNE takes 5.7065 seconds +AUTOTUNE convolution(32x256x38x42, 256x256x3x3) + convolution 
0.2750 ms 100.0% + triton_convolution_1064 1.7089 ms 16.1% + triton_convolution_1062 1.8415 ms 14.9% + triton_convolution_1059 2.1425 ms 12.8% + triton_convolution_1065 2.9491 ms 9.3% + triton_convolution_1063 3.9602 ms 6.9% + triton_convolution_1060 4.6976 ms 5.9% + triton_convolution_1061 7.4853 ms 3.7% +SingleProcess AUTOTUNE takes 5.0653 seconds +AUTOTUNE addmm(3268608x256, 3268608x256, 256x256) + triton_mm_1067 3.5880 ms 100.0% + triton_mm_1068 3.6040 ms 99.6% + triton_mm_1073 3.8683 ms 92.8% + triton_mm_1066 4.2427 ms 84.6% + triton_mm_1070 4.2668 ms 84.1% + triton_mm_1069 4.2703 ms 84.0% + triton_mm_1074 4.8278 ms 74.3% + bias_addmm 5.9413 ms 60.4% + addmm 5.9656 ms 60.1% + triton_mm_1076 6.2184 ms 57.7% +SingleProcess AUTOTUNE takes 5.7881 seconds +AUTOTUNE addmm(817152x256, 817152x512, 512x256) + bias_addmm 1.3238 ms 100.0% + triton_mm_1080 1.5291 ms 86.6% + triton_mm_1079 1.5313 ms 86.4% + triton_mm_1081 1.7488 ms 75.7% + triton_mm_1082 1.7947 ms 73.8% + addmm 1.8729 ms 70.7% + triton_mm_1078 1.9184 ms 69.0% + triton_mm_1086 2.0476 ms 64.6% + triton_mm_1085 2.6087 ms 50.7% + triton_mm_1088 2.8319 ms 46.7% +SingleProcess AUTOTUNE takes 6.2909 seconds +AUTOTUNE addmm(204288x256, 204288x1024, 1024x256) + bias_addmm 0.5711 ms 100.0% + triton_mm_1091 0.6686 ms 85.4% + triton_mm_1092 0.6850 ms 83.4% + addmm 0.7367 ms 77.5% + triton_mm_1093 0.7751 ms 73.7% + triton_mm_1094 0.7860 ms 72.7% + triton_mm_1090 0.8937 ms 63.9% + triton_mm_1098 0.9115 ms 62.7% + triton_mm_1097 0.9857 ms 57.9% + triton_mm_1100 1.3505 ms 42.3% +SingleProcess AUTOTUNE takes 5.8394 seconds +AUTOTUNE convolution(32x256x304x336, 256x256x3x3) + convolution 17.6540 ms 100.0% + triton_convolution_1107 100.5644 ms 17.6% + triton_convolution_1105 118.3672 ms 14.9% + triton_convolution_1102 122.4366 ms 14.4% + triton_convolution_1108 186.0839 ms 9.5% + triton_convolution_1106 265.2757 ms 6.7% + triton_convolution_1103 300.7162 ms 5.9% + triton_convolution_1104 466.2684 ms 3.8% +SingleProcess 
AUTOTUNE takes 17.1328 seconds +AUTOTUNE convolution(32x256x152x168, 256x256x3x3) + convolution 4.3941 ms 100.0% + triton_convolution_1114 25.2578 ms 17.4% + triton_convolution_1112 29.7074 ms 14.8% + triton_convolution_1109 31.0365 ms 14.2% + triton_convolution_1115 46.3984 ms 9.5% + triton_convolution_1113 66.1080 ms 6.6% + triton_convolution_1110 76.1160 ms 5.8% + triton_convolution_1111 116.5012 ms 3.8% +SingleProcess AUTOTUNE takes 7.3893 seconds +AUTOTUNE addmm(3268608x3, 3268608x256, 256x3) + triton_mm_1132 1.0489 ms 100.0% + triton_mm_1131 1.0684 ms 98.2% + triton_mm_1135 1.0694 ms 98.1% + triton_mm_1133 1.0721 ms 97.8% + triton_mm_1134 1.0732 ms 97.7% + triton_mm_1137 1.0832 ms 96.8% + triton_mm_1130 1.0861 ms 96.6% + triton_mm_1141 1.1324 ms 92.6% + triton_mm_1140 1.1654 ms 90.0% + triton_mm_1138 1.6334 ms 64.2% +SingleProcess AUTOTUNE takes 4.4295 seconds +AUTOTUNE addmm(817152x3, 817152x256, 256x3) + triton_mm_1151 0.2790 ms 100.0% + triton_mm_1150 0.2827 ms 98.7% + triton_mm_1154 0.2834 ms 98.4% + triton_mm_1152 0.2838 ms 98.3% + triton_mm_1153 0.2846 ms 98.0% + triton_mm_1149 0.2883 ms 96.8% + triton_mm_1156 0.2890 ms 96.5% + triton_mm_1160 0.3026 ms 92.2% + triton_mm_1159 0.3090 ms 90.3% + bias_addmm 0.4003 ms 69.7% +SingleProcess AUTOTUNE takes 4.7481 seconds +AUTOTUNE addmm(204288x3, 204288x256, 256x3) + triton_mm_1169 0.0863 ms 100.0% + triton_mm_1171 0.0867 ms 99.5% + triton_mm_1173 0.0870 ms 99.2% + triton_mm_1172 0.0878 ms 98.3% + triton_mm_1170 0.0879 ms 98.2% + triton_mm_1168 0.0897 ms 96.2% + triton_mm_1175 0.0902 ms 95.7% + triton_mm_1178 0.0952 ms 90.6% + triton_mm_1179 0.1020 ms 84.6% + triton_mm_1176 0.1132 ms 76.3% +SingleProcess AUTOTUNE takes 4.5108 seconds +AUTOTUNE addmm(51072x3, 51072x256, 256x3) + triton_mm_1189 0.0340 ms 100.0% + triton_mm_1191 0.0343 ms 99.0% + triton_mm_1190 0.0344 ms 98.7% + triton_mm_1188 0.0347 ms 97.9% + triton_mm_1187 0.0351 ms 96.9% + triton_mm_1194 0.0351 ms 96.8% + triton_mm_1192 0.0352 ms 96.6% + 
triton_mm_1197 0.0364 ms 93.4% + triton_mm_1198 0.0364 ms 93.3% + bias_addmm 0.0380 ms 89.4% +SingleProcess AUTOTUNE takes 4.2754 seconds +AUTOTUNE convolution(32x256x19x21, 256x256x3x3) + convolution 0.0683 ms 100.0% + triton_convolution_1202 0.4277 ms 16.0% + triton_convolution_1204 0.4415 ms 15.5% + triton_convolution_1199 0.5705 ms 12.0% + triton_convolution_1205 0.6938 ms 9.8% + triton_convolution_1200 0.7713 ms 8.9% + triton_convolution_1203 0.7883 ms 8.7% + triton_convolution_1201 1.8232 ms 3.7% +SingleProcess AUTOTUNE takes 4.8825 seconds +AUTOTUNE addmm(12768x3, 12768x256, 256x3) + triton_mm_1209 0.0132 ms 100.0% + triton_mm_1207 0.0133 ms 99.8% + triton_mm_1211 0.0133 ms 99.5% + triton_mm_1208 0.0135 ms 97.9% + triton_mm_1210 0.0141 ms 94.1% + triton_mm_1214 0.0149 ms 89.0% + triton_mm_1206 0.0153 ms 86.6% + bias_addmm 0.0161 ms 82.5% + triton_mm_1213 0.0161 ms 82.3% + triton_mm_1216 0.0173 ms 76.7% +SingleProcess AUTOTUNE takes 4.5147 seconds +AUTOTUNE addmm(3268608x12, 3268608x256, 256x12) + triton_mm_1226 1.0755 ms 100.0% + triton_mm_1220 1.1086 ms 97.0% + triton_mm_1219 1.1316 ms 95.0% + triton_mm_1221 1.1340 ms 94.8% + triton_mm_1223 1.1346 ms 94.8% + triton_mm_1222 1.1365 ms 94.6% + triton_mm_1225 1.1634 ms 92.4% + triton_mm_1218 1.1695 ms 92.0% + triton_mm_1227 1.2061 ms 89.2% + triton_mm_1229 1.2184 ms 88.3% +SingleProcess AUTOTUNE takes 4.3601 seconds +AUTOTUNE addmm(817152x12, 817152x256, 256x12) + triton_mm_1238 0.2854 ms 100.0% + triton_mm_1232 0.2928 ms 97.5% + triton_mm_1231 0.2960 ms 96.4% + triton_mm_1233 0.2968 ms 96.2% + triton_mm_1235 0.2970 ms 96.1% + triton_mm_1234 0.2985 ms 95.6% + triton_mm_1237 0.3063 ms 93.2% + triton_mm_1230 0.3072 ms 92.9% + triton_mm_1239 0.3164 ms 90.2% + triton_mm_1236 0.3207 ms 89.0% +SingleProcess AUTOTUNE takes 4.1640 seconds +AUTOTUNE addmm(204288x12, 204288x256, 256x12) + triton_mm_1250 0.0884 ms 100.0% + triton_mm_1243 0.0885 ms 99.8% + triton_mm_1247 0.0889 ms 99.4% + triton_mm_1245 0.0892 ms 99.1% + 
triton_mm_1244 0.0893 ms 98.9% + triton_mm_1246 0.0901 ms 98.1% + triton_mm_1242 0.0923 ms 95.8% + triton_mm_1249 0.0923 ms 95.8% + triton_mm_1248 0.0940 ms 94.0% + triton_mm_1251 0.0940 ms 94.0% +SingleProcess AUTOTUNE takes 4.0702 seconds +AUTOTUNE addmm(51072x12, 51072x256, 256x12) + triton_mm_1256 0.0333 ms 100.0% + triton_mm_1262 0.0336 ms 99.0% + triton_mm_1263 0.0343 ms 97.2% + triton_mm_1255 0.0343 ms 97.1% + triton_mm_1257 0.0344 ms 96.8% + triton_mm_1258 0.0344 ms 96.7% + triton_mm_1254 0.0350 ms 95.1% + triton_mm_1261 0.0350 ms 95.1% + triton_mm_1259 0.0352 ms 94.5% + triton_mm_1260 0.0361 ms 92.2% +SingleProcess AUTOTUNE takes 4.3500 seconds +AUTOTUNE addmm(12768x12, 12768x256, 256x12) + triton_mm_1267 0.0132 ms 100.0% + triton_mm_1275 0.0132 ms 100.0% + triton_mm_1271 0.0134 ms 98.6% + triton_mm_1270 0.0135 ms 97.9% + triton_mm_1274 0.0136 ms 97.4% + triton_mm_1272 0.0137 ms 96.7% + triton_mm_1269 0.0138 ms 95.6% + triton_mm_1268 0.0139 ms 95.2% + bias_addmm 0.0147 ms 90.2% + triton_mm_1266 0.0152 ms 87.1% +SingleProcess AUTOTUNE takes 4.2287 seconds +skipping cudagraphs due to ['non-cuda device in graph'] +[2023-12-12 11:30:21,351] [30/0_1] torch._inductor.utils: [WARNING] DeviceCopy in input program +skipping cudagraphs due to ['non-cuda device in graph'] +AUTOTUNE int_mm(32000x12544, 12544x1024, 32000x1024) + triton_mm_1288 1.6268 ms 100.0% + triton_mm_1287 1.6641 ms 97.8% + triton_mm_1286 3.2028 ms 50.8% + triton_mm_1279 3.3561 ms 48.5% + triton_mm_1280 3.3804 ms 48.1% + triton_mm_1281 3.5864 ms 45.4% + triton_mm_1282 3.6539 ms 44.5% + triton_mm_1278 4.6130 ms 35.3% + triton_mm_1285 6.7362 ms 24.2% + triton_mm_1284 10.3430 ms 15.7% +SingleProcess AUTOTUNE takes 7.9031 seconds +AUTOTUNE int_mm(32000x1024, 1024x1024, 32000x1024) + triton_mm_1299 0.3092 ms 100.0% + triton_mm_1298 0.3122 ms 99.0% + triton_mm_1291 0.3562 ms 86.8% + triton_mm_1290 0.3630 ms 85.2% + triton_mm_1297 0.3685 ms 83.9% + triton_mm_1292 0.4132 ms 74.8% + triton_mm_1293 0.4229 ms 
73.1% + triton_mm_1289 0.4459 ms 69.3% + triton_mm_1296 0.6285 ms 49.2% + triton_mm_1295 0.9780 ms 31.6% +SingleProcess AUTOTUNE takes 7.8710 seconds +AUTOTUNE int_mm(32000x1024, 1024x81, 32000x81) + triton_mm_1309 0.0622 ms 100.0% + triton_mm_1308 0.0670 ms 92.8% + triton_mm_1304 0.0760 ms 81.8% + triton_mm_1302 0.0789 ms 78.8% + triton_mm_1310 0.0896 ms 69.4% + triton_mm_1301 0.0958 ms 64.9% + triton_mm_1303 0.0958 ms 64.9% + triton_mm_1305 0.1015 ms 61.2% + triton_mm_1300 0.1186 ms 52.4% + triton_mm_1307 0.1248 ms 49.8% +SingleProcess AUTOTUNE takes 8.0148 seconds +AUTOTUNE int_mm(32000x1024, 1024x320, 32000x320) + triton_mm_1319 0.1247 ms 100.0% + triton_mm_1321 0.1270 ms 98.2% + triton_mm_1313 0.1348 ms 92.5% + triton_mm_1315 0.1486 ms 83.9% + triton_mm_1320 0.1552 ms 80.4% + triton_mm_1312 0.1607 ms 77.6% + triton_mm_1311 0.1653 ms 75.4% + triton_mm_1314 0.1705 ms 73.1% + triton_mm_1318 0.2664 ms 46.8% + triton_mm_1317 0.3146 ms 39.6% +SingleProcess AUTOTUNE takes 7.6002 seconds +skipping cudagraphs due to ['mutated inputs'] +skipping cudagraphs due to ['mutated inputs'] +skipping cudagraphs due to ['mutated inputs'] +skipping cudagraphs due to ['mutated inputs'] +skipping cudagraphs due to ['mutated inputs'] +skipping cudagraphs due to ['mutated inputs'] +skipping cudagraphs due to ['mutated inputs'] +skipping cudagraphs due to ['mutated inputs'] +skipping cudagraphs due to ['mutated inputs'] +skipping cudagraphs due to ['mutated inputs'] +skipping cudagraphs due to ['mutated inputs'] +skipping cudagraphs due to ['mutated inputs'] +skipping cudagraphs due to ['mutated inputs'] +skipping cudagraphs due to ['mutated inputs'] +skipping cudagraphs due to ['mutated inputs'] +skipping cudagraphs due to ['mutated inputs'] +skipping cudagraphs due to ['mutated inputs'] +skipping cudagraphs due to ['mutated inputs'] +skipping cudagraphs due to ['mutated inputs'] +skipping cudagraphs due to ['mutated inputs'] +skipping cudagraphs due to ['mutated inputs'] +skipping 
cudagraphs due to ['mutated inputs'] +skipping cudagraphs due to ['mutated inputs'] +skipping cudagraphs due to ['mutated inputs'] +skipping cudagraphs due to ['mutated inputs'] +skipping cudagraphs due to ['mutated inputs'] +skipping cudagraphs due to ['mutated inputs'] +skipping cudagraphs due to ['mutated inputs'] +skipping cudagraphs due to ['mutated inputs'] +skipping cudagraphs due to ['mutated inputs'] +skipping cudagraphs due to ['mutated inputs'] +skipping cudagraphs due to ['mutated inputs'] + running benchmark: 0%| | 0/30 [00:00 + async_compile.wait(globals()) + File "/home/cdhernandez/local/pytorch/torch/_inductor/codecache.py", line 2470, in wait + scope[key] = result.result() + File "/home/cdhernandez/local/pytorch/torch/_inductor/codecache.py", line 2313, in result + self.future.result() + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/concurrent/futures/_base.py", line 458, in result + return self.__get_result() + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/concurrent/futures/_base.py", line 403, in __get_result + raise self._exception +torch._dynamo.exc.BackendCompilerFailed: backend='inductor' raised: +CompilationError: at 14:40: xnumel = 196 + yoffset = tl.program_id(1).to(tl.int64) * YBLOCK + yindex = yoffset + tl.arange(0, YBLOCK)[None, :].to(tl.int64) + ymask = yindex < ynumel + xoffset = tl.program_id(0).to(tl.int64) * XBLOCK + xindex = xoffset + tl.arange(0, XBLOCK)[:, None].to(tl.int64) + xmask = xindex < xnumel + x2 = xindex + y3 = yindex + y0 = yindex % 1024 + y1 = (yindex // 1024) + tmp0 = tl.load(in_ptr0 + (x2 + (196*y3)), xmask, eviction_policy='evict_last').to(tl.float32) + ^ +ValueError('numel (262144) exceeds triton maximum tensor numel (131072)') + +Set TORCH_LOGS="+dynamo" and TORCHDYNAMO_VERBOSE=1 for more information + + +You can suppress this exception and fall back to eager by setting: + import torch._dynamo + torch._dynamo.config.suppress_errors = True + +Run failed with return 
code: 255 +Output: None +Error: None + loading model: 0it [00:00, ?it/s]detectron2_maskrcnn_r_101_fpn +WARNING:common:Model detectron2_maskrcnn_r_101_fpn does not support bfloat16, running with amp instead + loading model: 0it [00:08, ?it/s] +WARNING:common:Model detectron2_maskrcnn_r_101_fpn does not support bfloat16, running with amp instead +cuda eval detectron2_maskrcnn_r_101_fpn int8dynamic-bs32 +WARNING:common:Model detectron2_maskrcnn_r_101_fpn does not support bfloat16, running with amp instead +skipping cudagraphs due to ['mutated inputs'] +skipping cudagraphs due to ['non-cuda device in graph'] +[2023-12-12 12:04:47,676] [30/0_1] torch._inductor.utils: [WARNING] DeviceCopy in input program +skipping cudagraphs due to ['non-cuda device in graph'] +skipping cudagraphs due to ['non-cuda device in graph'] +[2023-12-12 12:06:04,086] [30/1_1] torch._inductor.utils: [WARNING] DeviceCopy in input program +skipping cudagraphs due to ['non-cuda device in graph'] +AUTOTUNE convolution(967x256x14x14, 256x256x3x3) + convolution 0.9932 ms 100.0% + triton_convolution_1327 5.8403 ms 17.0% + triton_convolution_1325 6.4791 ms 15.3% + triton_convolution_1322 7.3768 ms 13.5% + triton_convolution_1328 10.8547 ms 9.1% + triton_convolution_1326 14.0171 ms 7.1% + triton_convolution_1323 18.6382 ms 5.3% + triton_convolution_1324 25.5688 ms 3.9% +SingleProcess AUTOTUNE takes 5.9567 seconds +AUTOTUNE addmm(758128x80, 758128x256, 256x80) + bias_addmm 0.3601 ms 100.0% + triton_mm_1352 0.4278 ms 84.2% + triton_mm_1357 0.4576 ms 78.7% + triton_mm_1351 0.4592 ms 78.4% + triton_mm_1354 0.4899 ms 73.5% + triton_mm_1353 0.4958 ms 72.6% + triton_mm_1350 0.5550 ms 64.9% + triton_mm_1358 0.5671 ms 63.5% + addmm 0.6006 ms 60.0% + triton_mm_1355 0.7203 ms 50.0% +SingleProcess AUTOTUNE takes 5.8245 seconds +skipping cudagraphs due to ['mutated inputs'] +skipping cudagraphs due to ['mutated inputs'] +skipping cudagraphs due to ['mutated inputs'] +skipping cudagraphs due to ['mutated inputs'] 
+skipping cudagraphs due to ['mutated inputs'] +skipping cudagraphs due to ['mutated inputs'] +skipping cudagraphs due to ['mutated inputs'] +skipping cudagraphs due to ['mutated inputs'] +skipping cudagraphs due to ['mutated inputs'] +skipping cudagraphs due to ['mutated inputs'] +skipping cudagraphs due to ['mutated inputs'] +skipping cudagraphs due to ['mutated inputs'] +skipping cudagraphs due to ['mutated inputs'] +skipping cudagraphs due to ['mutated inputs'] +skipping cudagraphs due to ['mutated inputs'] +skipping cudagraphs due to ['mutated inputs'] +skipping cudagraphs due to ['mutated inputs'] +skipping cudagraphs due to ['mutated inputs'] +skipping cudagraphs due to ['mutated inputs'] +skipping cudagraphs due to ['mutated inputs'] +skipping cudagraphs due to ['mutated inputs'] +skipping cudagraphs due to ['mutated inputs'] +skipping cudagraphs due to ['mutated inputs'] +skipping cudagraphs due to ['mutated inputs'] +skipping cudagraphs due to ['mutated inputs'] +skipping cudagraphs due to ['mutated inputs'] +skipping cudagraphs due to ['mutated inputs'] +skipping cudagraphs due to ['mutated inputs'] +skipping cudagraphs due to ['mutated inputs'] +skipping cudagraphs due to ['mutated inputs'] + running benchmark: 0%| | 0/30 [00:00 + async_compile.wait(globals()) + File "/home/cdhernandez/local/pytorch/torch/_inductor/codecache.py", line 2470, in wait + scope[key] = result.result() + File "/home/cdhernandez/local/pytorch/torch/_inductor/codecache.py", line 2313, in result + self.future.result() + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/concurrent/futures/_base.py", line 458, in result + return self.__get_result() + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/concurrent/futures/_base.py", line 403, in __get_result + raise self._exception +torch._dynamo.exc.BackendCompilerFailed: backend='inductor' raised: +CompilationError: at 14:40: xnumel = 196 + yoffset = tl.program_id(1).to(tl.int64) * YBLOCK + 
yindex = yoffset + tl.arange(0, YBLOCK)[None, :].to(tl.int64) + ymask = yindex < ynumel + xoffset = tl.program_id(0).to(tl.int64) * XBLOCK + xindex = xoffset + tl.arange(0, XBLOCK)[:, None].to(tl.int64) + xmask = xindex < xnumel + x2 = xindex + y3 = yindex + y0 = yindex % 1024 + y1 = (yindex // 1024) + tmp0 = tl.load(in_ptr0 + (x2 + (196*y3)), xmask, eviction_policy='evict_last').to(tl.float32) + ^ +ValueError('numel (262144) exceeds triton maximum tensor numel (131072)') + +Set TORCH_LOGS="+dynamo" and TORCHDYNAMO_VERBOSE=1 for more information + + +You can suppress this exception and fall back to eager by setting: + import torch._dynamo + torch._dynamo.config.suppress_errors = True + +Run failed with return code: 255 +Output: None +Error: None + loading model: 0it [00:00, ?it/s]detectron2_maskrcnn_r_50_fpn +WARNING:common:Model detectron2_maskrcnn_r_50_fpn does not support bfloat16, running with amp instead + loading model: 0it [00:06, ?it/s] +WARNING:common:Model detectron2_maskrcnn_r_50_fpn does not support bfloat16, running with amp instead +cuda eval detectron2_maskrcnn_r_50_fpn int8dynamic-bs32 +WARNING:common:Model detectron2_maskrcnn_r_50_fpn does not support bfloat16, running with amp instead +skipping cudagraphs due to ['mutated inputs'] +skipping cudagraphs due to ['non-cuda device in graph'] +[2023-12-12 12:17:51,708] [30/0_1] torch._inductor.utils: [WARNING] DeviceCopy in input program +skipping cudagraphs due to ['non-cuda device in graph'] +skipping cudagraphs due to ['non-cuda device in graph'] +[2023-12-12 12:18:56,679] [30/1_1] torch._inductor.utils: [WARNING] DeviceCopy in input program +skipping cudagraphs due to ['non-cuda device in graph'] +AUTOTUNE convolution(1154x256x14x14, 256x256x3x3) + convolution 1.1834 ms 100.0% + triton_convolution_800 6.9411 ms 17.0% + triton_convolution_798 7.7173 ms 15.3% + triton_convolution_795 8.7726 ms 13.5% + triton_convolution_801 12.9288 ms 9.2% + triton_convolution_799 16.7297 ms 7.1% + 
triton_convolution_796 22.5466 ms 5.2% + triton_convolution_797 30.0653 ms 3.9% +SingleProcess AUTOTUNE takes 5.5937 seconds +AUTOTUNE addmm(904736x80, 904736x256, 256x80) + bias_addmm 0.4253 ms 100.0% + triton_mm_830 0.5515 ms 77.1% + triton_mm_825 0.5585 ms 76.2% + triton_mm_827 0.5972 ms 71.2% + triton_mm_824 0.7048 ms 60.3% + triton_mm_826 0.7101 ms 59.9% + addmm 0.7138 ms 59.6% + triton_mm_831 0.7736 ms 55.0% + triton_mm_823 0.8249 ms 51.6% + triton_mm_828 0.8686 ms 49.0% +SingleProcess AUTOTUNE takes 5.7222 seconds +skipping cudagraphs due to ['mutated inputs'] +skipping cudagraphs due to ['mutated inputs'] +skipping cudagraphs due to ['mutated inputs'] +skipping cudagraphs due to ['mutated inputs'] +skipping cudagraphs due to ['mutated inputs'] +skipping cudagraphs due to ['mutated inputs'] +skipping cudagraphs due to ['mutated inputs'] +skipping cudagraphs due to ['mutated inputs'] +skipping cudagraphs due to ['mutated inputs'] +skipping cudagraphs due to ['mutated inputs'] +skipping cudagraphs due to ['mutated inputs'] +skipping cudagraphs due to ['mutated inputs'] +skipping cudagraphs due to ['mutated inputs'] +skipping cudagraphs due to ['mutated inputs'] +skipping cudagraphs due to ['mutated inputs'] +skipping cudagraphs due to ['mutated inputs'] +skipping cudagraphs due to ['mutated inputs'] +skipping cudagraphs due to ['mutated inputs'] +skipping cudagraphs due to ['mutated inputs'] +skipping cudagraphs due to ['mutated inputs'] +skipping cudagraphs due to ['mutated inputs'] +skipping cudagraphs due to ['mutated inputs'] +skipping cudagraphs due to ['mutated inputs'] +skipping cudagraphs due to ['mutated inputs'] +skipping cudagraphs due to ['mutated inputs'] +skipping cudagraphs due to ['mutated inputs'] +skipping cudagraphs due to ['mutated inputs'] +skipping cudagraphs due to ['mutated inputs'] +skipping cudagraphs due to ['mutated inputs'] +skipping cudagraphs due to ['mutated inputs'] +skipping cudagraphs due to ['mutated inputs'] + running 
benchmark: 0%| | 0/30 [00:00bhts", q, k * softmax_scale) + File "/home/cdhernandez/local/pytorch/torch/functional.py", line 380, in einsum + return _VF.einsum(equation, operands) # type: ignore[attr-defined] +torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 1024.00 MiB. GPU 0 has a total capacity of 79.15 GiB of which 589.69 MiB is free. Including non-PyTorch memory, this process has 78.57 GiB memory in use. Of the allocated memory 77.73 GiB is allocated by PyTorch, and 336.30 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +The above exception was the direct cause of the following exception: + +Traceback (most recent call last): + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 3559, in run + ) = runner.load_model( + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/torchbench.py", line 476, in load_model + self.validate_model(model, example_inputs) + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 1934, in validate_model + raise NotImplementedError("Eager model failed to run") from e +NotImplementedError: Eager model failed to run + + loading model: 0it [00:00, ?it/s] loading model: 0it [00:02, ?it/s] +phlippe_densenet +cuda eval phlippe_densenet int8dynamic-bs32 +AUTOTUNE convolution(32x3x32x32, 32x3x3x3) + triton_convolution_0 0.0162 ms 100.0% + triton_convolution_4 0.0166 ms 97.9% + convolution 0.0184 ms 88.2% + triton_convolution_3 0.0198 ms 81.8% + triton_convolution_2 0.0214 ms 75.8% + triton_convolution_5 0.0251 ms 64.7% + triton_convolution_1 0.0260 ms 62.4% +SingleProcess AUTOTUNE takes 2.9950 seconds +AUTOTUNE mm(32768x32, 32x32) + triton_mm_9 0.0096 ms 100.0% + triton_mm_11 0.0096 ms 100.0% + 
triton_mm_14 0.0096 ms 100.0% + triton_mm_6 0.0098 ms 98.0% + triton_mm_7 0.0098 ms 98.0% + triton_mm_17 0.0098 ms 98.0% + triton_mm_8 0.0099 ms 97.1% + triton_mm_16 0.0100 ms 96.1% + triton_mm_12 0.0100 ms 95.5% + triton_mm_15 0.0100 ms 95.5% +SingleProcess AUTOTUNE takes 3.3572 seconds +AUTOTUNE convolution(32x32x32x32, 16x32x3x3) + convolution 0.0164 ms 100.0% + triton_convolution_21 0.0302 ms 54.4% + triton_convolution_22 0.0314 ms 52.4% + triton_convolution_18 0.0315 ms 52.2% + triton_convolution_23 0.0425 ms 38.7% + triton_convolution_19 0.0427 ms 38.6% + triton_convolution_20 0.0780 ms 21.1% +SingleProcess AUTOTUNE takes 3.1045 seconds +AUTOTUNE mm(32768x48, 48x32) + triton_mm_34 0.0109 ms 100.0% + triton_mm_35 0.0113 ms 96.0% + triton_mm_26 0.0116 ms 94.2% + triton_mm_33 0.0116 ms 93.4% + triton_mm_31 0.0117 ms 92.9% + triton_mm_24 0.0117 ms 92.6% + triton_mm_32 0.0118 ms 91.9% + triton_mm_27 0.0121 ms 89.7% + triton_mm_28 0.0122 ms 89.1% + triton_mm_25 0.0126 ms 86.5% +SingleProcess AUTOTUNE takes 4.0981 seconds +AUTOTUNE mm(32768x128, 128x64) + triton_mm_114 0.0159 ms 100.0% + triton_mm_115 0.0166 ms 96.0% + triton_mm_121 0.0168 ms 94.7% + triton_mm_116 0.0170 ms 94.0% + triton_mm_117 0.0175 ms 90.9% + triton_mm_122 0.0181 ms 88.0% + triton_mm_118 0.0191 ms 83.4% + triton_mm_124 0.0204 ms 78.1% + mm 0.0205 ms 77.8% + triton_mm_120 0.0205 ms 77.6% +SingleProcess AUTOTUNE takes 4.1403 seconds +AUTOTUNE mm(8192x64, 64x32) + triton_mm_128 0.0079 ms 100.0% + triton_mm_126 0.0084 ms 94.6% + triton_mm_133 0.0084 ms 94.6% + triton_mm_132 0.0084 ms 94.3% + triton_mm_129 0.0085 ms 92.9% + triton_mm_135 0.0086 ms 92.2% + triton_mm_127 0.0086 ms 91.5% + triton_mm_130 0.0087 ms 90.8% + triton_mm_134 0.0088 ms 90.1% + triton_mm_137 0.0088 ms 89.5% +SingleProcess AUTOTUNE takes 3.7232 seconds +AUTOTUNE convolution(32x32x16x16, 16x32x3x3) + convolution 0.0109 ms 100.0% + triton_convolution_141 0.0175 ms 62.3% + triton_convolution_142 0.0184 ms 59.4% + 
triton_convolution_138 0.0193 ms 56.5% + triton_convolution_143 0.0242 ms 45.2% + triton_convolution_139 0.0316 ms 34.5% + triton_convolution_140 0.0779 ms 14.0% +SingleProcess AUTOTUNE takes 2.6783 seconds +AUTOTUNE mm(8192x160, 160x80) + triton_mm_235 0.0125 ms 100.0% + triton_mm_236 0.0125 ms 99.7% + triton_mm_234 0.0129 ms 97.0% + triton_mm_237 0.0130 ms 96.3% + triton_mm_238 0.0130 ms 96.3% + mm 0.0132 ms 94.7% + triton_mm_239 0.0137 ms 91.1% + triton_mm_242 0.0140 ms 89.5% + triton_mm_245 0.0142 ms 88.1% + triton_mm_241 0.0146 ms 85.7% +SingleProcess AUTOTUNE takes 5.1406 seconds +AUTOTUNE mm(2048x80, 80x32) + triton_mm_252 0.0071 ms 100.0% + triton_mm_249 0.0072 ms 98.2% + triton_mm_251 0.0072 ms 98.2% + triton_mm_254 0.0074 ms 96.1% + triton_mm_247 0.0076 ms 92.9% + triton_mm_250 0.0076 ms 92.5% + triton_mm_248 0.0079 ms 89.8% + triton_mm_255 0.0079 ms 89.8% + triton_mm_246 0.0084 ms 84.7% + mm 0.0084 ms 83.7% +SingleProcess AUTOTUNE takes 4.4931 seconds +AUTOTUNE convolution(32x32x8x8, 16x32x3x3) + convolution 0.0106 ms 100.0% + triton_convolution_262 0.0164 ms 64.6% + triton_convolution_258 0.0171 ms 62.2% + triton_convolution_261 0.0179 ms 59.3% + triton_convolution_263 0.0257 ms 41.4% + triton_convolution_259 0.0326 ms 32.6% + triton_convolution_260 0.0773 ms 13.7% +SingleProcess AUTOTUNE takes 2.7801 seconds +AUTOTUNE mm(2048x96, 96x32) + triton_mm_270 0.0070 ms 100.0% + triton_mm_267 0.0072 ms 97.8% + triton_mm_266 0.0074 ms 94.4% + triton_mm_265 0.0076 ms 91.6% + triton_mm_268 0.0076 ms 91.6% + triton_mm_269 0.0078 ms 90.3% + triton_mm_272 0.0079 ms 89.0% + triton_mm_273 0.0079 ms 89.0% + triton_mm_264 0.0084 ms 83.9% + mm 0.0084 ms 83.0% +SingleProcess AUTOTUNE takes 3.7179 seconds +AUTOTUNE mm(2048x112, 112x32) + triton_mm_288 0.0074 ms 100.0% + triton_mm_285 0.0076 ms 97.1% + triton_mm_287 0.0076 ms 97.1% + triton_mm_283 0.0079 ms 94.5% + triton_mm_290 0.0079 ms 94.3% + triton_mm_291 0.0079 ms 94.3% + triton_mm_286 0.0081 ms 91.7% + triton_mm_284 
0.0083 ms 89.2% + mm 0.0084 ms 87.9% + triton_mm_282 0.0095 ms 78.4% +SingleProcess AUTOTUNE takes 3.7757 seconds +AUTOTUNE mm(2048x128, 128x32) + triton_mm_308 0.0074 ms 100.0% + triton_mm_305 0.0076 ms 96.7% + triton_mm_306 0.0078 ms 94.7% + triton_mm_303 0.0080 ms 92.0% + triton_mm_302 0.0081 ms 91.3% + triton_mm_304 0.0081 ms 91.3% + triton_mm_309 0.0081 ms 91.3% + mm 0.0081 ms 90.9% + triton_mm_301 0.0083 ms 88.8% + triton_mm_300 0.0092 ms 79.9% +SingleProcess AUTOTUNE takes 3.7442 seconds +AUTOTUNE mm(2048x144, 144x32) + triton_mm_324 0.0080 ms 100.0% + triton_mm_326 0.0083 ms 96.2% + triton_mm_327 0.0084 ms 95.8% + triton_mm_321 0.0084 ms 95.4% + triton_mm_323 0.0084 ms 94.7% + triton_mm_322 0.0085 ms 94.0% + triton_mm_320 0.0085 ms 93.6% + triton_mm_319 0.0086 ms 93.3% + mm 0.0092 ms 86.5% + triton_mm_318 0.0099 ms 80.9% +SingleProcess AUTOTUNE takes 3.8789 seconds +AUTOTUNE mm(2048x160, 160x32) + triton_mm_342 0.0076 ms 100.0% + triton_mm_341 0.0079 ms 95.9% + triton_mm_344 0.0079 ms 95.9% + triton_mm_345 0.0084 ms 90.4% + triton_mm_338 0.0085 ms 88.4% + triton_mm_337 0.0086 ms 88.1% + triton_mm_339 0.0086 ms 88.1% + triton_mm_340 0.0090 ms 83.7% + mm 0.0100 ms 75.4% + triton_mm_336 0.0104 ms 72.8% +SingleProcess AUTOTUNE takes 4.1559 seconds +AUTOTUNE mm(2048x176, 176x88) + triton_mm_359 0.0086 ms 100.0% + triton_mm_362 0.0088 ms 97.8% + triton_mm_357 0.0092 ms 92.7% + mm 0.0095 ms 90.5% + triton_mm_355 0.0095 ms 90.5% + triton_mm_363 0.0096 ms 89.6% + triton_mm_356 0.0097 ms 88.4% + triton_mm_358 0.0101 ms 84.8% + triton_mm_360 0.0106 ms 80.7% + triton_mm_354 0.0115 ms 74.4% +SingleProcess AUTOTUNE takes 4.7225 seconds +AUTOTUNE mm(512x88, 88x32) + triton_mm_367 0.0069 ms 100.0% + triton_mm_369 0.0071 ms 96.9% + triton_mm_372 0.0071 ms 96.9% + triton_mm_374 0.0071 ms 96.9% + triton_mm_368 0.0074 ms 93.5% + triton_mm_371 0.0076 ms 91.3% + triton_mm_370 0.0076 ms 90.8% + triton_mm_375 0.0081 ms 85.7% + triton_mm_366 0.0083 ms 83.4% + mm 0.0088 ms 78.8% 
+SingleProcess AUTOTUNE takes 4.2952 seconds +AUTOTUNE convolution(32x32x4x4, 16x32x3x3) + convolution 0.0109 ms 100.0% + triton_convolution_378 0.0168 ms 65.3% + triton_convolution_382 0.0184 ms 59.6% + triton_convolution_381 0.0189 ms 58.0% + triton_convolution_383 0.0280 ms 39.0% + triton_convolution_379 0.0321 ms 34.1% + triton_convolution_380 0.0421 ms 26.0% +SingleProcess AUTOTUNE takes 2.5215 seconds +AUTOTUNE mm(512x104, 104x32) + triton_mm_390 0.0068 ms 100.0% + triton_mm_385 0.0071 ms 95.7% + triton_mm_392 0.0071 ms 95.7% + triton_mm_389 0.0074 ms 92.8% + triton_mm_387 0.0074 ms 92.4% + triton_mm_388 0.0078 ms 87.1% + triton_mm_386 0.0079 ms 86.1% + triton_mm_393 0.0080 ms 84.9% + mm 0.0088 ms 77.9% + triton_mm_384 0.0090 ms 76.0% +SingleProcess AUTOTUNE takes 3.7611 seconds +AUTOTUNE mm(512x120, 120x32) + triton_mm_403 0.0072 ms 100.0% + triton_mm_408 0.0074 ms 97.4% + triton_mm_404 0.0078 ms 91.8% + triton_mm_410 0.0078 ms 91.6% + triton_mm_407 0.0078 ms 91.4% + triton_mm_405 0.0079 ms 91.1% + triton_mm_411 0.0081 ms 88.9% + triton_mm_406 0.0085 ms 84.7% + triton_mm_402 0.0092 ms 78.0% + mm 0.0095 ms 75.2% +SingleProcess AUTOTUNE takes 3.8015 seconds +AUTOTUNE mm(512x136, 136x32) + triton_mm_426 0.0074 ms 100.0% + triton_mm_421 0.0077 ms 96.7% + triton_mm_429 0.0079 ms 94.3% + triton_mm_428 0.0081 ms 91.9% + triton_mm_423 0.0081 ms 91.7% + triton_mm_425 0.0081 ms 91.7% + mm 0.0086 ms 86.6% + triton_mm_422 0.0087 ms 85.1% + triton_mm_424 0.0088 ms 84.7% + triton_mm_420 0.0099 ms 74.8% +SingleProcess AUTOTUNE takes 4.1494 seconds +AUTOTUNE mm(512x152, 152x32) + triton_mm_444 0.0071 ms 100.0% + triton_mm_441 0.0076 ms 93.7% + triton_mm_447 0.0078 ms 91.4% + triton_mm_446 0.0081 ms 88.5% + triton_mm_443 0.0081 ms 88.1% + triton_mm_439 0.0083 ms 85.8% + triton_mm_440 0.0083 ms 85.8% + triton_mm_442 0.0083 ms 85.8% + mm 0.0091 ms 78.2% + triton_mm_438 0.0097 ms 73.8% +SingleProcess AUTOTUNE takes 4.0948 seconds +AUTOTUNE mm(512x168, 168x32) + triton_mm_465 
0.0074 ms 100.0% + triton_mm_464 0.0076 ms 97.5% + triton_mm_462 0.0076 ms 97.1% + triton_mm_461 0.0079 ms 94.3% + triton_mm_459 0.0085 ms 87.2% + triton_mm_457 0.0085 ms 86.9% + triton_mm_458 0.0090 ms 82.6% + mm 0.0091 ms 81.4% + triton_mm_460 0.0092 ms 80.8% + triton_mm_456 0.0104 ms 71.6% +SingleProcess AUTOTUNE takes 4.0597 seconds +AUTOTUNE int_mm(32x184, 184x10, 32x10) + triton_mm_479 0.0069 ms 100.0% + triton_mm_477 0.0071 ms 96.9% + triton_mm_478 0.0076 ms 90.8% + triton_mm_476 0.0078 ms 88.2% + triton_mm_475 0.0083 ms 83.4% + triton_mm_474 0.0092 ms 75.3% +SingleProcess AUTOTUNE takes 2.5471 seconds + running benchmark: 0%| | 0/30 [00:00 /home/cdhernandez/local/pytorch/torch/_ops.py(759)__call__() +-> return self._op(*args, **(kwargs or {})) +(Pdb) TIMEOUT + loading model: 0it [00:00, ?it/s] loading model: 0it [00:02, ?it/s] +timm_efficientnet +cuda eval timm_efficientnet int8dynamic-bs32 +AUTOTUNE convolution(32x3x224x224, 32x3x3x3) + convolution 0.1094 ms 100.0% + triton_convolution_4 0.1242 ms 88.1% + triton_convolution_0 0.1343 ms 81.5% + triton_convolution_3 0.1370 ms 79.9% + triton_convolution_2 0.1484 ms 73.7% + triton_convolution_5 0.1879 ms 58.2% + triton_convolution_1 0.2244 ms 48.8% +SingleProcess AUTOTUNE takes 0.9131 seconds +AUTOTUNE addmm(32x8, 32x32, 32x8) + triton_mm_6 0.0065 ms 100.0% + triton_mm_7 0.0070 ms 91.8% + triton_mm_8 0.0070 ms 91.8% + triton_mm_9 0.0070 ms 91.8% + triton_mm_11 0.0071 ms 91.0% + triton_mm_10 0.0074 ms 87.3% + bias_addmm 0.0076 ms 84.5% + addmm 0.0117 ms 55.3% +SingleProcess AUTOTUNE takes 1.0004 seconds +AUTOTUNE addmm(32x32, 32x8, 8x32) + triton_mm_13 0.0060 ms 100.0% + triton_mm_16 0.0062 ms 96.4% + triton_mm_14 0.0066 ms 90.8% + triton_mm_15 0.0066 ms 90.8% + triton_mm_12 0.0066 ms 90.6% + bias_addmm 0.0074 ms 81.0% + addmm 0.0120 ms 50.1% +SingleProcess AUTOTUNE takes 0.8723 seconds +AUTOTUNE mm(401408x32, 32x16) + triton_mm_17 0.0348 ms 100.0% + triton_mm_21 0.0348 ms 99.8% + triton_mm_24 0.0348 ms 99.7% + 
triton_mm_18 0.0351 ms 99.0% + triton_mm_20 0.0351 ms 98.9% + triton_mm_22 0.0351 ms 98.9% + triton_mm_19 0.0352 ms 98.8% + triton_mm_25 0.0354 ms 98.2% + triton_mm_23 0.0354 ms 98.1% + triton_mm_27 0.0386 ms 90.1% +SingleProcess AUTOTUNE takes 1.4750 seconds +AUTOTUNE mm(401408x16, 16x96) + triton_mm_33 0.0646 ms 100.0% + triton_mm_37 0.0647 ms 99.9% + triton_mm_38 0.0661 ms 97.8% + triton_mm_28 0.0673 ms 96.1% + triton_mm_32 0.0678 ms 95.4% + triton_mm_29 0.0685 ms 94.3% + triton_mm_30 0.0685 ms 94.3% + triton_mm_35 0.0686 ms 94.2% + triton_mm_31 0.0689 ms 93.9% + triton_mm_36 0.0690 ms 93.7% +SingleProcess AUTOTUNE takes 1.6201 seconds +AUTOTUNE addmm(32x4, 32x96, 96x4) + triton_mm_42 0.0067 ms 100.0% + triton_mm_41 0.0073 ms 92.5% + triton_mm_40 0.0073 ms 92.1% + triton_mm_43 0.0074 ms 91.3% + triton_mm_44 0.0078 ms 85.7% + bias_addmm 0.0081 ms 82.7% + triton_mm_39 0.0082 ms 82.2% + triton_mm_46 0.0083 ms 80.8% + triton_mm_45 0.0089 ms 75.8% + addmm 0.0112 ms 59.8% +SingleProcess AUTOTUNE takes 1.2656 seconds +AUTOTUNE addmm(32x96, 32x4, 4x96) + triton_mm_52 0.0062 ms 100.0% + triton_mm_51 0.0063 ms 99.5% + triton_mm_53 0.0063 ms 99.5% + triton_mm_57 0.0065 ms 96.1% + triton_mm_56 0.0067 ms 93.1% + triton_mm_48 0.0067 ms 92.9% + triton_mm_50 0.0069 ms 90.3% + triton_mm_55 0.0069 ms 90.1% + triton_mm_49 0.0070 ms 89.4% + triton_mm_47 0.0070 ms 89.0% +SingleProcess AUTOTUNE takes 1.9945 seconds +AUTOTUNE mm(100352x96, 96x24) + triton_mm_65 0.0292 ms 100.0% + triton_mm_61 0.0296 ms 98.7% + triton_mm_62 0.0299 ms 97.7% + triton_mm_58 0.0300 ms 97.5% + triton_mm_63 0.0304 ms 96.3% + triton_mm_66 0.0304 ms 96.1% + triton_mm_60 0.0309 ms 94.6% + triton_mm_59 0.0310 ms 94.2% + triton_mm_69 0.0318 ms 91.9% + triton_mm_68 0.0320 ms 91.3% +SingleProcess AUTOTUNE takes 1.6133 seconds +AUTOTUNE mm(100352x24, 24x144) + triton_mm_72 0.0318 ms 100.0% + triton_mm_70 0.0332 ms 95.8% + triton_mm_81 0.0342 ms 93.1% + triton_mm_74 0.0343 ms 92.9% + triton_mm_75 0.0355 ms 89.7% + 
triton_mm_78 0.0355 ms 89.6% + triton_mm_79 0.0375 ms 85.0% + triton_mm_71 0.0380 ms 83.8% + triton_mm_77 0.0390 ms 81.6% + triton_mm_76 0.0396 ms 80.3% +SingleProcess AUTOTUNE takes 1.6121 seconds +AUTOTUNE addmm(32x6, 32x144, 144x6) + triton_mm_85 0.0069 ms 100.0% + triton_mm_86 0.0076 ms 90.8% + triton_mm_84 0.0077 ms 90.0% + triton_mm_87 0.0078 ms 88.6% + triton_mm_83 0.0079 ms 87.5% + bias_addmm 0.0091 ms 76.7% + triton_mm_82 0.0091 ms 76.1% + triton_mm_88 0.0108 ms 64.0% + triton_mm_89 0.0111 ms 62.5% + addmm 0.0123 ms 56.7% +SingleProcess AUTOTUNE takes 1.2720 seconds +AUTOTUNE addmm(32x144, 32x6, 6x144) + triton_mm_90 0.0065 ms 100.0% + triton_mm_96 0.0065 ms 100.0% + triton_mm_93 0.0067 ms 96.2% + triton_mm_97 0.0067 ms 96.2% + triton_mm_95 0.0068 ms 95.7% + triton_mm_99 0.0068 ms 95.3% + triton_mm_92 0.0070 ms 92.2% + triton_mm_94 0.0070 ms 92.2% + triton_mm_98 0.0070 ms 92.2% + triton_mm_100 0.0070 ms 92.2% +SingleProcess AUTOTUNE takes 1.7632 seconds +AUTOTUNE mm(100352x144, 144x24) + triton_mm_105 0.0410 ms 100.0% + triton_mm_103 0.0410 ms 99.8% + triton_mm_106 0.0416 ms 98.5% + triton_mm_108 0.0416 ms 98.5% + triton_mm_104 0.0421 ms 97.3% + triton_mm_112 0.0421 ms 97.3% + triton_mm_109 0.0422 ms 97.0% + triton_mm_111 0.0432 ms 94.8% + triton_mm_102 0.0435 ms 94.1% + mm 0.0449 ms 91.3% +SingleProcess AUTOTUNE takes 1.5944 seconds +AUTOTUNE mm(25088x144, 144x40) + triton_mm_146 0.0155 ms 100.0% + triton_mm_151 0.0162 ms 95.5% + triton_mm_148 0.0166 ms 93.1% + triton_mm_155 0.0177 ms 87.5% + triton_mm_145 0.0180 ms 86.1% + triton_mm_152 0.0181 ms 85.3% + triton_mm_144 0.0182 ms 85.0% + triton_mm_147 0.0188 ms 82.4% + triton_mm_149 0.0196 ms 79.1% + mm 0.0201 ms 76.8% +SingleProcess AUTOTUNE takes 4.1262 seconds +AUTOTUNE mm(25088x40, 40x240) + triton_mm_164 0.0207 ms 100.0% + triton_mm_158 0.0208 ms 99.7% + mm 0.0216 ms 95.9% + triton_mm_167 0.0223 ms 93.1% + triton_mm_160 0.0226 ms 91.8% + triton_mm_163 0.0240 ms 86.3% + triton_mm_157 0.0243 ms 85.3% + 
triton_mm_159 0.0249 ms 83.4% + triton_mm_165 0.0258 ms 80.5% + triton_mm_156 0.0261 ms 79.4% +SingleProcess AUTOTUNE takes 1.6583 seconds +AUTOTUNE addmm(32x10, 32x240, 240x10) + triton_mm_171 0.0078 ms 100.0% + triton_mm_172 0.0078 ms 99.2% + triton_mm_173 0.0081 ms 96.4% + triton_mm_170 0.0084 ms 92.4% + triton_mm_169 0.0085 ms 91.0% + bias_addmm 0.0095 ms 81.8% + triton_mm_168 0.0109 ms 71.1% + addmm 0.0128 ms 60.6% + triton_mm_175 0.0148 ms 52.5% + triton_mm_174 0.0150 ms 51.8% +SingleProcess AUTOTUNE takes 1.3745 seconds +AUTOTUNE addmm(32x240, 32x10, 10x240) + triton_mm_181 0.0062 ms 100.0% + triton_mm_185 0.0062 ms 100.0% + triton_mm_178 0.0065 ms 96.5% + triton_mm_180 0.0065 ms 96.5% + triton_mm_182 0.0065 ms 96.5% + triton_mm_184 0.0065 ms 96.5% + triton_mm_186 0.0065 ms 96.5% + triton_mm_176 0.0070 ms 89.0% + triton_mm_183 0.0075 ms 83.7% + triton_mm_179 0.0075 ms 83.3% +SingleProcess AUTOTUNE takes 1.6535 seconds +AUTOTUNE mm(25088x240, 240x40) + triton_mm_189 0.0208 ms 100.0% + triton_mm_191 0.0210 ms 99.2% + triton_mm_194 0.0216 ms 96.2% + triton_mm_195 0.0231 ms 90.2% + triton_mm_190 0.0233 ms 89.2% + mm 0.0240 ms 86.6% + triton_mm_187 0.0247 ms 84.1% + triton_mm_188 0.0248 ms 84.0% + triton_mm_198 0.0259 ms 80.3% + triton_mm_192 0.0260 ms 80.0% +SingleProcess AUTOTUNE takes 4.6825 seconds +AUTOTUNE mm(6272x240, 240x80) + triton_mm_232 0.0122 ms 100.0% + triton_mm_234 0.0124 ms 99.0% + triton_mm_233 0.0125 ms 97.7% + triton_mm_238 0.0126 ms 97.2% + triton_mm_231 0.0128 ms 95.7% + triton_mm_235 0.0140 ms 87.6% + mm 0.0141 ms 86.8% + triton_mm_239 0.0145 ms 84.1% + triton_mm_230 0.0156 ms 78.6% + triton_mm_236 0.0170 ms 71.9% +SingleProcess AUTOTUNE takes 1.6571 seconds +AUTOTUNE mm(6272x80, 80x480) + triton_mm_249 0.0139 ms 100.0% + mm 0.0148 ms 94.0% + triton_mm_245 0.0148 ms 94.0% + triton_mm_243 0.0149 ms 93.1% + triton_mm_242 0.0152 ms 91.4% + triton_mm_246 0.0154 ms 90.6% + triton_mm_244 0.0154 ms 90.4% + triton_mm_250 0.0182 ms 76.4% + 
triton_mm_252 0.0185 ms 75.4% + triton_mm_253 0.0209 ms 66.7% +SingleProcess AUTOTUNE takes 1.7384 seconds +AUTOTUNE addmm(32x20, 32x480, 480x20) + triton_mm_257 0.0092 ms 100.0% + triton_mm_258 0.0100 ms 92.0% + triton_mm_259 0.0101 ms 90.9% + triton_mm_256 0.0103 ms 89.4% + triton_mm_255 0.0113 ms 81.4% + bias_addmm 0.0119 ms 77.2% + addmm 0.0155 ms 59.5% + triton_mm_254 0.0170 ms 54.3% + triton_mm_261 0.0198 ms 46.5% + triton_mm_260 0.0207 ms 44.6% +SingleProcess AUTOTUNE takes 1.2630 seconds +AUTOTUNE addmm(32x480, 32x20, 20x480) + triton_mm_267 0.0065 ms 100.0% + triton_mm_262 0.0065 ms 99.5% + triton_mm_268 0.0065 ms 99.5% + triton_mm_273 0.0069 ms 93.1% + triton_mm_271 0.0070 ms 92.2% + triton_mm_264 0.0071 ms 91.4% + triton_mm_266 0.0071 ms 90.6% + triton_mm_270 0.0071 ms 90.6% + triton_mm_263 0.0074 ms 87.4% + triton_mm_269 0.0075 ms 86.3% +SingleProcess AUTOTUNE takes 1.7823 seconds +AUTOTUNE mm(6272x480, 480x80) + triton_mm_278 0.0151 ms 100.0% + mm 0.0153 ms 98.7% + triton_mm_277 0.0154 ms 98.0% + triton_mm_282 0.0157 ms 95.9% + triton_mm_275 0.0164 ms 91.8% + triton_mm_276 0.0165 ms 91.5% + triton_mm_279 0.0174 ms 86.7% + triton_mm_283 0.0205 ms 73.4% + triton_mm_280 0.0209 ms 72.1% + triton_mm_274 0.0231 ms 65.2% +SingleProcess AUTOTUNE takes 1.6409 seconds +AUTOTUNE mm(6272x480, 480x112) + triton_mm_366 0.0156 ms 100.0% + mm 0.0159 ms 98.4% + triton_mm_365 0.0159 ms 98.4% + triton_mm_364 0.0166 ms 94.0% + triton_mm_370 0.0171 ms 91.4% + triton_mm_363 0.0174 ms 89.9% + triton_mm_367 0.0202 ms 77.4% + triton_mm_362 0.0233 ms 67.3% + triton_mm_371 0.0234 ms 67.0% + triton_mm_368 0.0244 ms 64.3% +SingleProcess AUTOTUNE takes 1.6373 seconds +AUTOTUNE mm(6272x112, 112x672) + triton_mm_376 0.0191 ms 100.0% + triton_mm_381 0.0207 ms 92.6% + triton_mm_375 0.0217 ms 88.2% + triton_mm_378 0.0221 ms 86.4% + triton_mm_382 0.0223 ms 85.7% + triton_mm_377 0.0226 ms 84.6% + triton_mm_374 0.0231 ms 82.8% + mm 0.0250 ms 76.7% + triton_mm_384 0.0277 ms 69.0% + 
triton_mm_383 0.0313 ms 61.1% +SingleProcess AUTOTUNE takes 1.6365 seconds +AUTOTUNE addmm(32x28, 32x672, 672x28) + triton_mm_389 0.0107 ms 100.0% + triton_mm_390 0.0112 ms 96.0% + triton_mm_388 0.0118 ms 90.5% + bias_addmm 0.0121 ms 88.6% + triton_mm_391 0.0128 ms 84.0% + triton_mm_387 0.0139 ms 77.0% + addmm 0.0156 ms 68.8% + triton_mm_386 0.0213 ms 50.4% + triton_mm_393 0.0257 ms 41.7% + triton_mm_392 0.0261 ms 41.1% +SingleProcess AUTOTUNE takes 1.2713 seconds +AUTOTUNE addmm(32x672, 32x28, 28x672) + triton_mm_403 0.0065 ms 100.0% + triton_mm_398 0.0067 ms 96.7% + triton_mm_397 0.0069 ms 93.1% + triton_mm_399 0.0070 ms 92.2% + triton_mm_396 0.0072 ms 90.0% + triton_mm_402 0.0072 ms 89.8% + triton_mm_394 0.0072 ms 89.4% + triton_mm_400 0.0072 ms 89.4% + triton_mm_395 0.0074 ms 87.8% + triton_mm_401 0.0075 ms 86.3% +SingleProcess AUTOTUNE takes 1.7761 seconds +AUTOTUNE mm(6272x672, 672x112) + mm 0.0181 ms 100.0% + triton_mm_410 0.0182 ms 99.6% + triton_mm_409 0.0190 ms 95.1% + triton_mm_408 0.0203 ms 89.1% + triton_mm_414 0.0205 ms 88.3% + triton_mm_407 0.0210 ms 86.3% + triton_mm_411 0.0250 ms 72.5% + triton_mm_412 0.0282 ms 64.3% + triton_mm_415 0.0295 ms 61.4% + triton_mm_406 0.0298 ms 60.7% +SingleProcess AUTOTUNE takes 1.6350 seconds +AUTOTUNE mm(1568x672, 672x192) + mm 0.0125 ms 100.0% + triton_mm_502 0.0135 ms 92.4% + triton_mm_498 0.0149 ms 83.9% + triton_mm_497 0.0154 ms 80.9% + triton_mm_499 0.0157 ms 79.3% + triton_mm_500 0.0157 ms 79.3% + triton_mm_503 0.0160 ms 78.0% + triton_mm_495 0.0177 ms 70.4% + triton_mm_496 0.0179 ms 69.8% + triton_mm_494 0.0256 ms 48.8% +SingleProcess AUTOTUNE takes 5.0328 seconds +AUTOTUNE mm(1568x192, 192x1152) + triton_mm_507 0.0138 ms 100.0% + triton_mm_508 0.0142 ms 97.1% + triton_mm_506 0.0144 ms 95.3% + mm 0.0149 ms 92.1% + triton_mm_514 0.0154 ms 89.2% + triton_mm_509 0.0156 ms 87.9% + triton_mm_510 0.0158 ms 87.2% + triton_mm_513 0.0166 ms 82.9% + triton_mm_516 0.0175 ms 78.6% + triton_mm_512 0.0215 ms 63.9% 
+SingleProcess AUTOTUNE takes 1.6465 seconds +AUTOTUNE addmm(32x48, 32x1152, 1152x48) + bias_addmm 0.0122 ms 100.0% + triton_mm_521 0.0134 ms 90.7% + triton_mm_522 0.0151 ms 80.3% + triton_mm_524 0.0153 ms 79.2% + addmm 0.0155 ms 78.7% + triton_mm_525 0.0158 ms 77.2% + triton_mm_520 0.0161 ms 75.4% + triton_mm_519 0.0209 ms 58.2% + triton_mm_518 0.0346 ms 35.1% + triton_mm_523 0.0355 ms 34.2% +SingleProcess AUTOTUNE takes 1.5143 seconds +AUTOTUNE addmm(32x1152, 32x48, 48x1152) + triton_mm_536 0.0072 ms 100.0% + triton_mm_533 0.0074 ms 97.0% + triton_mm_530 0.0076 ms 94.1% + triton_mm_537 0.0077 ms 93.0% + triton_mm_529 0.0079 ms 91.5% + triton_mm_534 0.0079 ms 91.5% + triton_mm_531 0.0080 ms 89.6% + triton_mm_532 0.0082 ms 87.9% + triton_mm_528 0.0082 ms 87.5% + triton_mm_538 0.0083 ms 86.5% +SingleProcess AUTOTUNE takes 1.7876 seconds +AUTOTUNE mm(1568x1152, 1152x192) + mm 0.0146 ms 100.0% + triton_mm_548 0.0175 ms 83.2% + triton_mm_549 0.0197 ms 73.9% + triton_mm_543 0.0203 ms 72.0% + triton_mm_544 0.0207 ms 70.4% + triton_mm_545 0.0210 ms 69.4% + triton_mm_546 0.0218 ms 67.1% + triton_mm_541 0.0252 ms 57.8% + triton_mm_542 0.0261 ms 55.9% + triton_mm_540 0.0390 ms 37.4% +SingleProcess AUTOTUNE takes 1.8321 seconds +AUTOTUNE mm(1568x1152, 1152x320) + mm 0.0172 ms 100.0% + triton_mm_681 0.0209 ms 82.4% + triton_mm_682 0.0209 ms 82.4% + triton_mm_686 0.0223 ms 77.3% + triton_mm_680 0.0257 ms 66.9% + triton_mm_679 0.0260 ms 66.2% + triton_mm_683 0.0284 ms 60.6% + triton_mm_684 0.0289 ms 59.5% + triton_mm_687 0.0314 ms 54.8% + triton_mm_678 0.0392 ms 44.0% +SingleProcess AUTOTUNE takes 1.6326 seconds +AUTOTUNE mm(1568x320, 320x1280) + triton_mm_691 0.0172 ms 100.0% + triton_mm_692 0.0173 ms 99.4% + mm 0.0180 ms 95.6% + triton_mm_690 0.0184 ms 93.4% + triton_mm_693 0.0193 ms 89.4% + triton_mm_694 0.0195 ms 88.5% + triton_mm_698 0.0204 ms 84.3% + triton_mm_697 0.0211 ms 81.8% + triton_mm_700 0.0258 ms 66.8% + triton_mm_696 0.0310 ms 55.5% +SingleProcess AUTOTUNE takes 
1.6535 seconds +AUTOTUNE int_mm(32x1280, 1280x1000, 32x1000) + triton_mm_712 0.0139 ms 100.0% + triton_mm_707 0.0156 ms 89.3% + triton_mm_710 0.0162 ms 85.8% + triton_mm_708 0.0169 ms 82.2% + triton_mm_711 0.0172 ms 80.8% + triton_mm_706 0.0182 ms 76.1% + triton_mm_705 0.0208 ms 66.7% + triton_mm_704 0.0236 ms 58.9% + triton_mm_703 0.0267 ms 52.0% + triton_mm_702 0.0346 ms 40.1% +SingleProcess AUTOTUNE takes 1.5528 seconds + running benchmark: 0%| | 0/30 [00:00 + run() + File "/home/cdhernandez/local/benchmark/run_benchmark.py", line 30, in run + benchmark.run(bm_args) + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/run.py", line 24, in run + main(TorchBenchmarkRunner(), original_dir, args) + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 3133, in main + process_entry(0, runner, original_dir, args) + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 3090, in process_entry + return maybe_fresh_cache( + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 1714, in inner + return fn(*args, **kwargs) + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 3559, in run + ) = runner.load_model( + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/torchbench.py", line 383, in load_model + module = importlib.import_module(c) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/importlib/__init__.py", line 126, in import_module + return _bootstrap._gcd_import(name[level:], package, level) + File "", line 1050, in _gcd_import + File "", line 1027, in _find_and_load + File "", line 1006, in _find_and_load_unlocked + File "", line 688, in _load_unlocked + File "", line 883, in exec_module + File "", line 241, in _call_with_frames_removed + File "/home/cdhernandez/local/benchmark/torchbenchmark/canary_models/torchrec_dlrm/__init__.py", line 7, in + from 
.data.dlrm_dataloader import get_dataloader + File "/home/cdhernandez/local/benchmark/torchbenchmark/canary_models/torchrec_dlrm/data/dlrm_dataloader.py", line 13, in + from torchrec.datasets.criteo import ( + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/torchrec/__init__.py", line 8, in + import torchrec.distributed # noqa + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/torchrec/distributed/__init__.py", line 36, in + from torchrec.distributed.model_parallel import DistributedModelParallel # noqa + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/torchrec/distributed/model_parallel.py", line 21, in + from torchrec.distributed.planner import EmbeddingShardingPlanner, Topology + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/torchrec/distributed/planner/__init__.py", line 22, in + from torchrec.distributed.planner.planners import EmbeddingShardingPlanner # noqa + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/torchrec/distributed/planner/planners.py", line 19, in + from torchrec.distributed.planner.constants import BATCH_SIZE, MAX_SIZE + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/torchrec/distributed/planner/constants.py", line 10, in + from torchrec.distributed.embedding_types import EmbeddingComputeKernel + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/torchrec/distributed/embedding_types.py", line 14, in + from fbgemm_gpu.split_table_batched_embeddings_ops_training import EmbeddingLocation + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/fbgemm_gpu/__init__.py", line 23, in + from . 
import _fbgemm_gpu_docs, sparse_ops # noqa: F401, E402 # noqa: F401, E402 + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/fbgemm_gpu/_fbgemm_gpu_docs.py", line 19, in + torch.ops.fbgemm.jagged_2d_to_dense, + File "/home/cdhernandez/local/pytorch/torch/_ops.py", line 822, in __getattr__ + raise AttributeError( +AttributeError: '_OpNamespace' 'fbgemm' object has no attribute 'jagged_2d_to_dense' +Run failed with return code: 1 +Output: None +Error: None + loading model: 0it [00:00, ?it/s] loading model: 0it [00:04, ?it/s] +BERT_pytorch +cuda eval BERT_pytorch baseline-bs32 +AUTOTUNE mm(4096x768, 768x768) + mm 0.0306 ms 100.0% + triton_mm_2 0.0371 ms 82.5% + triton_mm_1 0.0403 ms 75.8% + triton_mm_3 0.0411 ms 74.4% + triton_mm_4 0.0413 ms 73.9% + triton_mm_7 0.0438 ms 69.8% + triton_mm_8 0.0481 ms 63.6% + triton_mm_0 0.0570 ms 53.7% + triton_mm_10 0.0712 ms 42.9% + triton_mm_9 0.0790 ms 38.7% +SingleProcess AUTOTUNE takes 1.6112 seconds +AUTOTUNE bmm(384x128x64, 384x64x128) + triton_bmm_32 0.0257 ms 100.0% + triton_bmm_26 0.0265 ms 97.2% + triton_bmm_25 0.0267 ms 96.5% + triton_bmm_27 0.0272 ms 94.8% + triton_bmm_28 0.0281 ms 91.7% + triton_bmm_34 0.0281 ms 91.6% + triton_bmm_24 0.0282 ms 91.2% + bmm 0.0284 ms 90.8% + triton_bmm_31 0.0296 ms 87.0% + triton_bmm_33 0.0316 ms 81.6% +SingleProcess AUTOTUNE takes 1.7496 seconds +AUTOTUNE bmm(384x128x128, 384x128x64) + triton_bmm_50 0.0285 ms 100.0% + triton_bmm_49 0.0293 ms 97.4% + triton_bmm_52 0.0297 ms 96.1% + triton_bmm_48 0.0300 ms 95.1% + triton_bmm_55 0.0300 ms 94.9% + triton_bmm_56 0.0300 ms 94.9% + triton_bmm_51 0.0312 ms 91.6% + triton_bmm_54 0.0319 ms 89.5% + triton_bmm_58 0.0326 ms 87.5% + triton_bmm_53 0.0332 ms 85.9% +SingleProcess AUTOTUNE takes 1.6227 seconds +AUTOTUNE mm(4096x768, 768x3072) + mm 0.0995 ms 100.0% + triton_mm_74 0.1117 ms 89.0% + triton_mm_73 0.1118 ms 89.0% + triton_mm_79 0.1285 ms 77.4% + triton_mm_75 0.1327 ms 75.0% + triton_mm_76 0.1349 ms 73.8% + 
triton_mm_72 0.1421 ms 70.0% + triton_mm_80 0.1557 ms 63.9% + triton_mm_82 0.2340 ms 42.5% + triton_mm_77 0.2836 ms 35.1% +SingleProcess AUTOTUNE takes 1.6582 seconds +AUTOTUNE mm(4096x3072, 3072x768) + mm 0.0851 ms 100.0% + triton_mm_86 0.1235 ms 68.9% + triton_mm_85 0.1276 ms 66.7% + triton_mm_87 0.1304 ms 65.3% + triton_mm_88 0.1315 ms 64.7% + triton_mm_92 0.1557 ms 54.7% + triton_mm_84 0.1623 ms 52.4% + triton_mm_91 0.1916 ms 44.4% + triton_mm_89 0.2732 ms 31.2% + triton_mm_90 0.2747 ms 31.0% +SingleProcess AUTOTUNE takes 1.6729 seconds + running benchmark: 0%| | 0/30 [00:00 + async_compile.wait(globals()) + File "/home/cdhernandez/local/pytorch/torch/_inductor/codecache.py", line 2470, in wait + scope[key] = result.result() + File "/home/cdhernandez/local/pytorch/torch/_inductor/codecache.py", line 2313, in result + self.future.result() + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/concurrent/futures/_base.py", line 458, in result + return self.__get_result() + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/concurrent/futures/_base.py", line 403, in __get_result + raise self._exception +torch._dynamo.exc.BackendCompilerFailed: backend='inductor' raised: +CompilationError: at 14:40: xnumel = 196 + yoffset = tl.program_id(1).to(tl.int64) * YBLOCK + yindex = yoffset + tl.arange(0, YBLOCK)[None, :].to(tl.int64) + ymask = yindex < ynumel + xoffset = tl.program_id(0).to(tl.int64) * XBLOCK + xindex = xoffset + tl.arange(0, XBLOCK)[:, None].to(tl.int64) + xmask = xindex < xnumel + x2 = xindex + y3 = yindex + y0 = yindex % 1024 + y1 = (yindex // 1024) + tmp0 = tl.load(in_ptr0 + (x2 + (196*y3)), xmask, eviction_policy='evict_last').to(tl.float32) + ^ +ValueError('numel (262144) exceeds triton maximum tensor numel (131072)') + +Set TORCH_LOGS="+dynamo" and TORCHDYNAMO_VERBOSE=1 for more information + + +You can suppress this exception and fall back to eager by setting: + import torch._dynamo + 
torch._dynamo.config.suppress_errors = True + +Run failed with return code: 255 +Output: None +Error: None + loading model: 0it [00:00, ?it/s]detectron2_fasterrcnn_r_101_dc5 + loading model: 0it [00:09, ?it/s] +WARNING:root:detectron2_fasterrcnn_r_101_dc5 failed to load +Original Error: "roi_align_forward_kernel" not implemented for 'BFloat16' +Eager model failed to run +Traceback (most recent call last): + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 1931, in validate_model + self.model_iter_fn(model, example_inputs) + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/torchbench.py", line 540, in forward_pass + return mod(*inputs) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1520, in _call_impl + return forward_call(*args, **kwargs) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/modeling/meta_arch/rcnn.py", line 150, in forward + return self.inference(batched_inputs) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/modeling/meta_arch/rcnn.py", line 213, in inference + results, _ = self.roi_heads(images, features, proposals, None) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1520, in _call_impl + return forward_call(*args, **kwargs) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/modeling/roi_heads/roi_heads.py", line 747, in forward + pred_instances = self._forward_box(features, proposals) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/modeling/roi_heads/roi_heads.py", 
line 798, in _forward_box + box_features = self.box_pooler(features, [x.proposal_boxes for x in proposals]) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1520, in _call_impl + return forward_call(*args, **kwargs) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/modeling/poolers.py", line 246, in forward + return self.level_poolers[0](x[0], pooler_fmt_boxes) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1520, in _call_impl + return forward_call(*args, **kwargs) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/layers/roi_align.py", line 58, in forward + return roi_align( + File "/home/cdhernandez/local/vision/torchvision/ops/roi_align.py", line 238, in roi_align + return torch.ops.torchvision.roi_align( + File "/home/cdhernandez/local/pytorch/torch/_ops.py", line 755, in __call__ + return self._op(*args, **(kwargs or {})) +RuntimeError: "roi_align_forward_kernel" not implemented for 'BFloat16' + +The above exception was the direct cause of the following exception: + +Traceback (most recent call last): + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 3559, in run + ) = runner.load_model( + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/torchbench.py", line 476, in load_model + self.validate_model(model, example_inputs) + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 1934, in validate_model + raise NotImplementedError("Eager model failed to run") from e +NotImplementedError: Eager model failed to run + + loading model: 0it 
[00:00, ?it/s]detectron2_fasterrcnn_r_101_fpn +WARNING:common:Model detectron2_fasterrcnn_r_101_fpn does not support bfloat16, running with amp instead + loading model: 0it [00:05, ?it/s] +WARNING:common:Model detectron2_fasterrcnn_r_101_fpn does not support bfloat16, running with amp instead +cuda eval detectron2_fasterrcnn_r_101_fpn baseline-bs32 +WARNING:common:Model detectron2_fasterrcnn_r_101_fpn does not support bfloat16, running with amp instead +skipping cudagraphs due to ['mutated inputs'] +AUTOTUNE convolution(32x3x1216x1344, 64x3x7x7) + convolution 3.7276 ms 100.0% + triton_convolution_3 22.1441 ms 16.8% + triton_convolution_4 24.2946 ms 15.3% + triton_convolution_5 26.9298 ms 13.8% + triton_convolution_0 30.0927 ms 12.4% + triton_convolution_2 32.5115 ms 11.5% + triton_convolution_1 81.4334 ms 4.6% +SingleProcess AUTOTUNE takes 4.6148 seconds +AUTOTUNE mm(3268608x64, 64x64) + triton_mm_14 0.5204 ms 100.0% + triton_mm_8 0.5205 ms 100.0% + triton_mm_7 0.5292 ms 98.3% + triton_mm_10 0.5374 ms 96.8% + triton_mm_6 0.5463 ms 95.2% + triton_mm_13 0.5497 ms 94.7% + triton_mm_9 0.5542 ms 93.9% + mm 0.5696 ms 91.4% + triton_mm_15 0.7099 ms 73.3% + triton_mm_16 0.7413 ms 70.2% +SingleProcess AUTOTUNE takes 3.9672 seconds +AUTOTUNE convolution(32x64x304x336, 64x64x3x3) + convolution 1.4512 ms 100.0% + triton_convolution_18 7.1300 ms 20.4% + triton_convolution_23 8.1302 ms 17.8% + triton_convolution_24 9.7394 ms 14.9% + triton_convolution_19 12.0317 ms 12.1% + triton_convolution_21 12.1762 ms 11.9% + triton_convolution_22 12.6389 ms 11.5% + triton_convolution_20 28.5142 ms 5.1% +SingleProcess AUTOTUNE takes 4.5511 seconds +AUTOTUNE mm(3268608x64, 64x256) + triton_mm_27 1.5106 ms 100.0% + triton_mm_26 1.5171 ms 99.6% + triton_mm_28 1.7556 ms 86.0% + triton_mm_29 1.7616 ms 85.8% + mm 1.7827 ms 84.7% + triton_mm_33 1.8637 ms 81.1% + triton_mm_25 1.8785 ms 80.4% + triton_mm_32 1.9761 ms 76.4% + triton_mm_35 2.2260 ms 67.9% + triton_mm_34 2.8418 ms 53.2% +SingleProcess 
AUTOTUNE takes 4.4901 seconds +AUTOTUNE mm(3268608x256, 256x64) + triton_mm_51 1.3317 ms 100.0% + triton_mm_50 1.3587 ms 98.0% + triton_mm_53 1.3629 ms 97.7% + triton_mm_56 1.3673 ms 97.4% + triton_mm_57 1.3967 ms 95.3% + triton_mm_49 1.4020 ms 95.0% + mm 1.4044 ms 94.8% + triton_mm_52 1.4348 ms 92.8% + triton_mm_59 2.0760 ms 64.1% + triton_mm_55 2.1260 ms 62.6% +SingleProcess AUTOTUNE takes 4.6271 seconds +AUTOTUNE convolution(32x256x304x336, 128x256x1x1) + convolution 0.4530 ms 100.0% + triton_convolution_114 1.0770 ms 42.1% + triton_convolution_111 1.2264 ms 36.9% + triton_convolution_117 1.3645 ms 33.2% + triton_convolution_116 1.5053 ms 30.1% + triton_convolution_115 1.6691 ms 27.1% + triton_convolution_112 3.1430 ms 14.4% + triton_convolution_113 6.8266 ms 6.6% +SingleProcess AUTOTUNE takes 4.1577 seconds +AUTOTUNE convolution(32x128x152x168, 128x128x3x3) + convolution 1.2069 ms 100.0% + triton_convolution_121 7.0850 ms 17.0% + triton_convolution_118 8.0108 ms 15.1% + triton_convolution_123 8.3698 ms 14.4% + triton_convolution_124 10.6358 ms 11.3% + triton_convolution_122 12.3811 ms 9.7% + triton_convolution_119 13.8907 ms 8.7% + triton_convolution_120 28.6004 ms 4.2% +SingleProcess AUTOTUNE takes 4.5632 seconds +AUTOTUNE mm(817152x128, 128x512) + triton_mm_127 1.0106 ms 100.0% + triton_mm_126 1.0184 ms 99.2% + triton_mm_132 1.0308 ms 98.0% + mm 1.1343 ms 89.1% + triton_mm_125 1.1611 ms 87.0% + triton_mm_128 1.1932 ms 84.7% + triton_mm_129 1.2067 ms 83.7% + triton_mm_133 1.3923 ms 72.6% + triton_mm_135 1.7286 ms 58.5% + triton_mm_134 2.2943 ms 44.0% +SingleProcess AUTOTUNE takes 4.4943 seconds +AUTOTUNE convolution(32x256x304x336, 512x256x1x1) + convolution 1.5192 ms 100.0% + triton_convolution_140 4.2484 ms 35.8% + triton_convolution_142 4.5286 ms 33.5% + triton_convolution_143 5.4167 ms 28.0% + triton_convolution_141 6.6385 ms 22.9% + triton_convolution_137 8.3060 ms 18.3% + triton_convolution_138 12.5158 ms 12.1% + triton_convolution_139 27.1067 ms 5.6% 
+SingleProcess AUTOTUNE takes 5.2291 seconds +AUTOTUNE mm(817152x512, 512x128) + mm 0.7270 ms 100.0% + triton_mm_146 0.8278 ms 87.8% + triton_mm_145 0.8757 ms 83.0% + triton_mm_148 0.9058 ms 80.3% + triton_mm_147 0.9367 ms 77.6% + triton_mm_151 0.9769 ms 74.4% + triton_mm_152 1.0830 ms 67.1% + triton_mm_144 1.1040 ms 65.9% + triton_mm_154 1.6869 ms 43.1% + triton_mm_149 1.7984 ms 40.4% +SingleProcess AUTOTUNE takes 5.0176 seconds +AUTOTUNE convolution(32x512x152x168, 256x512x1x1) + convolution 0.3219 ms 100.0% + triton_convolution_240 1.0508 ms 30.6% + triton_convolution_242 1.0755 ms 29.9% + triton_convolution_243 1.2962 ms 24.8% + triton_convolution_241 1.6665 ms 19.3% + triton_convolution_237 2.1070 ms 15.3% + triton_convolution_238 3.1459 ms 10.2% + triton_convolution_239 6.7243 ms 4.8% +SingleProcess AUTOTUNE takes 4.9713 seconds +AUTOTUNE convolution(32x256x76x84, 256x256x3x3) + convolution 1.0804 ms 100.0% + triton_convolution_249 6.4292 ms 16.8% + triton_convolution_247 7.2717 ms 14.9% + triton_convolution_244 7.9900 ms 13.5% + triton_convolution_250 11.5452 ms 9.4% + triton_convolution_248 16.4734 ms 6.6% + triton_convolution_245 19.3684 ms 5.6% + triton_convolution_246 29.2683 ms 3.7% +SingleProcess AUTOTUNE takes 5.1353 seconds +AUTOTUNE mm(204288x256, 256x1024) + mm 0.6786 ms 100.0% + triton_mm_253 0.7821 ms 86.8% + triton_mm_252 0.7883 ms 86.1% + triton_mm_258 0.8225 ms 82.5% + triton_mm_251 0.9310 ms 72.9% + triton_mm_255 0.9324 ms 72.8% + triton_mm_254 0.9355 ms 72.5% + triton_mm_259 1.0938 ms 62.0% + triton_mm_261 1.3321 ms 50.9% + triton_mm_260 1.9005 ms 35.7% +SingleProcess AUTOTUNE takes 4.6551 seconds +AUTOTUNE convolution(32x512x152x168, 1024x512x1x1) + convolution 1.2104 ms 100.0% + triton_convolution_266 4.1278 ms 29.3% + triton_convolution_268 4.2156 ms 28.7% + triton_convolution_269 5.0886 ms 23.8% + triton_convolution_267 6.5420 ms 18.5% + triton_convolution_263 8.1750 ms 14.8% + triton_convolution_264 12.4580 ms 9.7% + 
triton_convolution_265 26.7420 ms 4.5% +SingleProcess AUTOTUNE takes 5.2265 seconds +AUTOTUNE mm(204288x1024, 1024x256) + mm 0.5437 ms 100.0% + triton_mm_271 0.6632 ms 82.0% + triton_mm_272 0.6646 ms 81.8% + triton_mm_274 0.7669 ms 70.9% + triton_mm_273 0.7675 ms 70.8% + triton_mm_270 0.8797 ms 61.8% + triton_mm_278 0.8978 ms 60.6% + triton_mm_277 0.9712 ms 56.0% + triton_mm_280 1.3469 ms 40.4% + triton_mm_279 1.5811 ms 34.4% +SingleProcess AUTOTUNE takes 5.2192 seconds +AUTOTUNE convolution(32x1024x76x84, 512x1024x1x1) + convolution 0.2622 ms 100.0% + triton_convolution_955 1.0030 ms 26.1% + triton_convolution_957 1.1227 ms 23.4% + triton_convolution_958 1.2446 ms 21.1% + triton_convolution_956 1.9245 ms 13.6% + triton_convolution_952 2.2700 ms 11.6% + triton_convolution_953 3.1720 ms 8.3% + triton_convolution_954 6.5570 ms 4.0% +SingleProcess AUTOTUNE takes 4.5819 seconds +AUTOTUNE convolution(32x512x38x42, 512x512x3x3) + convolution 1.0515 ms 100.0% + triton_convolution_964 6.9621 ms 15.1% + triton_convolution_959 9.8507 ms 10.7% + triton_convolution_962 10.1274 ms 10.4% + triton_convolution_965 17.0127 ms 6.2% + triton_convolution_963 19.1787 ms 5.5% + triton_convolution_960 21.0096 ms 5.0% + triton_convolution_961 28.6011 ms 3.7% +SingleProcess AUTOTUNE takes 5.3532 seconds +AUTOTUNE mm(51072x512, 512x2048) + mm 0.5559 ms 100.0% + triton_mm_968 0.6594 ms 84.3% + triton_mm_967 0.6638 ms 83.7% + triton_mm_973 0.7133 ms 77.9% + triton_mm_970 0.7841 ms 70.9% + triton_mm_969 0.7904 ms 70.3% + triton_mm_966 0.8376 ms 66.4% + triton_mm_974 0.9407 ms 59.1% + triton_mm_976 1.2370 ms 44.9% + triton_mm_971 1.7099 ms 32.5% +SingleProcess AUTOTUNE takes 4.7618 seconds +AUTOTUNE convolution(32x1024x76x84, 2048x1024x1x1) + convolution 1.0470 ms 100.0% + triton_convolution_981 3.9419 ms 26.6% + triton_convolution_983 4.3415 ms 24.1% + triton_convolution_984 4.9420 ms 21.2% + triton_convolution_982 7.6665 ms 13.7% + triton_convolution_978 8.2708 ms 12.7% + 
triton_convolution_979 12.2919 ms 8.5% + triton_convolution_980 26.1428 ms 4.0% +SingleProcess AUTOTUNE takes 4.8388 seconds +AUTOTUNE mm(51072x2048, 2048x512) + mm 0.4948 ms 100.0% + triton_mm_987 0.6086 ms 81.3% + triton_mm_986 0.6168 ms 80.2% + triton_mm_988 0.7104 ms 69.6% + triton_mm_989 0.7249 ms 68.3% + triton_mm_985 0.8283 ms 59.7% + triton_mm_993 0.8379 ms 59.1% + triton_mm_992 0.8847 ms 55.9% + triton_mm_995 1.3007 ms 38.0% + triton_mm_990 1.5264 ms 32.4% +SingleProcess AUTOTUNE takes 5.0192 seconds +AUTOTUNE addmm(51072x256, 51072x2048, 2048x256) + bias_addmm 0.3045 ms 100.0% + addmm 0.3104 ms 98.1% + triton_mm_1049 0.3329 ms 91.5% + triton_mm_1048 0.3370 ms 90.4% + triton_mm_1050 0.3632 ms 83.8% + triton_mm_1051 0.3691 ms 82.5% + triton_mm_1055 0.4297 ms 70.9% + triton_mm_1047 0.4396 ms 69.3% + triton_mm_1054 0.5082 ms 59.9% + triton_mm_1057 0.7069 ms 43.1% +SingleProcess AUTOTUNE takes 5.5123 seconds +AUTOTUNE convolution(32x256x38x42, 256x256x3x3) + convolution 0.2800 ms 100.0% + triton_convolution_1064 1.7070 ms 16.4% + triton_convolution_1062 1.8459 ms 15.2% + triton_convolution_1059 2.1395 ms 13.1% + triton_convolution_1065 2.9374 ms 9.5% + triton_convolution_1063 3.9718 ms 7.0% + triton_convolution_1060 4.7213 ms 5.9% + triton_convolution_1061 7.4863 ms 3.7% +SingleProcess AUTOTUNE takes 4.7817 seconds +AUTOTUNE addmm(3268608x256, 3268608x256, 256x256) + triton_mm_1068 3.6515 ms 100.0% + triton_mm_1067 3.7399 ms 97.6% + triton_mm_1073 3.9075 ms 93.4% + triton_mm_1069 4.2690 ms 85.5% + triton_mm_1070 4.3191 ms 84.5% + triton_mm_1066 4.4729 ms 81.6% + triton_mm_1074 4.8846 ms 74.8% + bias_addmm 5.9405 ms 61.5% + addmm 5.9812 ms 61.0% + triton_mm_1076 6.6301 ms 55.1% +SingleProcess AUTOTUNE takes 5.7035 seconds +AUTOTUNE addmm(817152x256, 817152x512, 512x256) + bias_addmm 1.3548 ms 100.0% + triton_mm_1080 1.4332 ms 94.5% + triton_mm_1079 1.5655 ms 86.5% + triton_mm_1081 1.8005 ms 75.2% + triton_mm_1082 1.8064 ms 75.0% + addmm 1.9001 ms 71.3% + 
triton_mm_1078 1.9506 ms 69.5% + triton_mm_1086 2.0710 ms 65.4% + triton_mm_1085 2.6391 ms 51.3% + triton_mm_1088 2.8551 ms 47.5% +SingleProcess AUTOTUNE takes 5.6193 seconds +AUTOTUNE addmm(204288x256, 204288x1024, 1024x256) + bias_addmm 0.5761 ms 100.0% + triton_mm_1091 0.6856 ms 84.0% + triton_mm_1092 0.6867 ms 83.9% + addmm 0.7476 ms 77.1% + triton_mm_1093 0.7855 ms 73.3% + triton_mm_1094 0.7955 ms 72.4% + triton_mm_1090 0.9032 ms 63.8% + triton_mm_1098 0.9230 ms 62.4% + triton_mm_1097 0.9917 ms 58.1% + triton_mm_1100 1.3640 ms 42.2% +SingleProcess AUTOTUNE takes 5.7268 seconds +AUTOTUNE convolution(32x256x304x336, 256x256x3x3) + convolution 17.6539 ms 100.0% + triton_convolution_1107 100.5318 ms 17.6% + triton_convolution_1105 118.5658 ms 14.9% + triton_convolution_1102 122.7876 ms 14.4% + triton_convolution_1108 186.2668 ms 9.5% + triton_convolution_1106 265.2579 ms 6.7% + triton_convolution_1103 300.6816 ms 5.9% + triton_convolution_1104 465.4240 ms 3.8% +SingleProcess AUTOTUNE takes 16.9654 seconds +AUTOTUNE convolution(32x256x152x168, 256x256x3x3) + convolution 4.4577 ms 100.0% + triton_convolution_1114 25.2454 ms 17.7% + triton_convolution_1112 29.8068 ms 15.0% + triton_convolution_1109 31.0213 ms 14.4% + triton_convolution_1115 46.3662 ms 9.6% + triton_convolution_1113 66.0892 ms 6.7% + triton_convolution_1110 76.3744 ms 5.8% + triton_convolution_1111 116.2610 ms 3.8% +SingleProcess AUTOTUNE takes 7.4362 seconds +AUTOTUNE addmm(3268608x3, 3268608x256, 256x3) + triton_mm_1132 1.0499 ms 100.0% + triton_mm_1131 1.0695 ms 98.2% + triton_mm_1135 1.0697 ms 98.1% + triton_mm_1133 1.0720 ms 97.9% + triton_mm_1134 1.0749 ms 97.7% + triton_mm_1137 1.0843 ms 96.8% + triton_mm_1130 1.0863 ms 96.6% + triton_mm_1141 1.1345 ms 92.5% + triton_mm_1140 1.1662 ms 90.0% + triton_mm_1138 1.2461 ms 84.3% +SingleProcess AUTOTUNE takes 5.0383 seconds +AUTOTUNE addmm(817152x3, 817152x256, 256x3) + triton_mm_1151 0.2790 ms 100.0% + triton_mm_1150 0.2831 ms 98.6% + triton_mm_1154 
0.2835 ms 98.4% + triton_mm_1152 0.2840 ms 98.3% + triton_mm_1153 0.2846 ms 98.1% + triton_mm_1149 0.2888 ms 96.6% + triton_mm_1156 0.2890 ms 96.5% + triton_mm_1160 0.3040 ms 91.8% + triton_mm_1159 0.3087 ms 90.4% + triton_mm_1157 0.3271 ms 85.3% +SingleProcess AUTOTUNE takes 4.3400 seconds +AUTOTUNE addmm(204288x3, 204288x256, 256x3) + triton_mm_1169 0.0862 ms 100.0% + triton_mm_1171 0.0866 ms 99.5% + triton_mm_1173 0.0869 ms 99.2% + triton_mm_1172 0.0875 ms 98.5% + triton_mm_1170 0.0877 ms 98.2% + triton_mm_1168 0.0899 ms 95.9% + triton_mm_1175 0.0902 ms 95.5% + triton_mm_1178 0.0952 ms 90.6% + triton_mm_1176 0.0966 ms 89.2% + triton_mm_1179 0.1013 ms 85.0% +SingleProcess AUTOTUNE takes 3.9785 seconds +AUTOTUNE addmm(51072x3, 51072x256, 256x3) + triton_mm_1189 0.0339 ms 100.0% + triton_mm_1191 0.0343 ms 98.8% + triton_mm_1188 0.0344 ms 98.7% + triton_mm_1190 0.0345 ms 98.3% + triton_mm_1187 0.0353 ms 96.2% + triton_mm_1194 0.0353 ms 96.2% + triton_mm_1192 0.0355 ms 95.7% + triton_mm_1198 0.0364 ms 93.3% + triton_mm_1197 0.0368 ms 92.3% + triton_mm_1195 0.0371 ms 91.4% +SingleProcess AUTOTUNE takes 4.3372 seconds +AUTOTUNE convolution(32x256x19x21, 256x256x3x3) + convolution 0.0686 ms 100.0% + triton_convolution_1202 0.4279 ms 16.0% + triton_convolution_1204 0.4406 ms 15.6% + triton_convolution_1199 0.5708 ms 12.0% + triton_convolution_1205 0.6937 ms 9.9% + triton_convolution_1200 0.7684 ms 8.9% + triton_convolution_1203 0.7883 ms 8.7% + triton_convolution_1201 1.8229 ms 3.8% +SingleProcess AUTOTUNE takes 4.5925 seconds +AUTOTUNE addmm(12768x3, 12768x256, 256x3) + triton_mm_1209 0.0132 ms 100.0% + triton_mm_1207 0.0133 ms 99.5% + triton_mm_1211 0.0133 ms 99.3% + triton_mm_1208 0.0140 ms 94.7% + triton_mm_1210 0.0140 ms 94.7% + triton_mm_1214 0.0143 ms 92.2% + bias_addmm 0.0155 ms 85.2% + triton_mm_1206 0.0157 ms 84.3% + triton_mm_1213 0.0161 ms 82.3% + triton_mm_1216 0.0176 ms 74.9% +SingleProcess AUTOTUNE takes 4.0712 seconds +AUTOTUNE addmm(3268608x12, 
3268608x256, 256x12) + triton_mm_1226 1.0775 ms 100.0% + triton_mm_1220 1.1099 ms 97.1% + triton_mm_1219 1.1344 ms 95.0% + triton_mm_1223 1.1381 ms 94.7% + triton_mm_1222 1.1384 ms 94.7% + triton_mm_1221 1.1386 ms 94.6% + triton_mm_1225 1.1644 ms 92.5% + triton_mm_1227 1.1674 ms 92.3% + triton_mm_1224 1.1682 ms 92.2% + triton_mm_1218 1.1714 ms 92.0% +SingleProcess AUTOTUNE takes 4.1467 seconds +AUTOTUNE addmm(817152x12, 817152x256, 256x12) + triton_mm_1238 0.2854 ms 100.0% + triton_mm_1232 0.2924 ms 97.6% + triton_mm_1231 0.2959 ms 96.5% + triton_mm_1233 0.2972 ms 96.0% + triton_mm_1235 0.2973 ms 96.0% + triton_mm_1234 0.2988 ms 95.5% + triton_mm_1236 0.3048 ms 93.6% + triton_mm_1237 0.3065 ms 93.1% + triton_mm_1239 0.3069 ms 93.0% + triton_mm_1230 0.3071 ms 92.9% +SingleProcess AUTOTUNE takes 3.9259 seconds +AUTOTUNE addmm(204288x12, 204288x256, 256x12) + triton_mm_1250 0.0881 ms 100.0% + triton_mm_1243 0.0881 ms 99.9% + triton_mm_1245 0.0890 ms 98.9% + triton_mm_1247 0.0893 ms 98.6% + triton_mm_1246 0.0896 ms 98.3% + triton_mm_1244 0.0896 ms 98.3% + triton_mm_1248 0.0911 ms 96.6% + triton_mm_1249 0.0923 ms 95.4% + triton_mm_1242 0.0924 ms 95.3% + triton_mm_1251 0.0925 ms 95.2% +SingleProcess AUTOTUNE takes 4.2935 seconds +AUTOTUNE addmm(51072x12, 51072x256, 256x12) + triton_mm_1256 0.0333 ms 100.0% + triton_mm_1262 0.0336 ms 99.3% + triton_mm_1258 0.0342 ms 97.5% + triton_mm_1263 0.0343 ms 97.2% + triton_mm_1255 0.0344 ms 97.0% + triton_mm_1257 0.0345 ms 96.8% + triton_mm_1261 0.0346 ms 96.3% + triton_mm_1259 0.0348 ms 95.7% + triton_mm_1254 0.0352 ms 94.6% + triton_mm_1265 0.0362 ms 92.1% +SingleProcess AUTOTUNE takes 4.0971 seconds +AUTOTUNE addmm(12768x12, 12768x256, 256x12) + triton_mm_1274 0.0129 ms 100.0% + triton_mm_1267 0.0132 ms 98.1% + triton_mm_1268 0.0134 ms 96.7% + triton_mm_1271 0.0134 ms 96.7% + triton_mm_1275 0.0135 ms 96.0% + triton_mm_1269 0.0137 ms 94.4% + triton_mm_1272 0.0138 ms 94.0% + triton_mm_1270 0.0139 ms 92.9% + bias_addmm 0.0149 ms 
86.9% + triton_mm_1266 0.0152 ms 85.2% +SingleProcess AUTOTUNE takes 3.8293 seconds +skipping cudagraphs due to ['non-cuda device in graph'] +[2023-12-12 17:13:02,525] [30/0_1] torch._inductor.utils: [WARNING] DeviceCopy in input program +skipping cudagraphs due to ['non-cuda device in graph'] +AUTOTUNE mm(32000x12544, 12544x1024) + mm 3.6433 ms 100.0% + triton_mm_1279 4.8504 ms 75.1% + triton_mm_1280 4.9807 ms 73.1% + triton_mm_1281 5.4958 ms 66.3% + triton_mm_1282 5.5499 ms 65.6% + triton_mm_1286 6.4978 ms 56.1% + triton_mm_1278 6.5740 ms 55.4% + triton_mm_1285 6.9910 ms 52.1% + triton_mm_1288 10.0610 ms 36.2% + triton_mm_1283 11.3868 ms 32.0% +SingleProcess AUTOTUNE takes 5.8258 seconds +AUTOTUNE mm(32000x1024, 1024x1024) + mm 0.3167 ms 100.0% + triton_mm_1292 0.3782 ms 83.7% + triton_mm_1291 0.3793 ms 83.5% + triton_mm_1293 0.4526 ms 70.0% + triton_mm_1294 0.4528 ms 69.9% + triton_mm_1290 0.4931 ms 64.2% + triton_mm_1298 0.5370 ms 59.0% + triton_mm_1297 0.5453 ms 58.1% + triton_mm_1300 0.7575 ms 41.8% + triton_mm_1299 0.9494 ms 33.4% +SingleProcess AUTOTUNE takes 5.4527 seconds +AUTOTUNE addmm(32000x81, 32000x1024, 1024x81) + triton_mm_1303 0.0913 ms 100.0% + triton_mm_1305 0.0941 ms 97.0% + triton_mm_1310 0.1024 ms 89.2% + triton_mm_1306 0.1032 ms 88.5% + triton_mm_1304 0.1063 ms 85.9% + triton_mm_1307 0.1226 ms 74.5% + triton_mm_1311 0.1240 ms 73.7% + triton_mm_1309 0.1309 ms 69.7% + triton_mm_1302 0.1377 ms 66.3% + triton_mm_1308 0.1441 ms 63.4% +SingleProcess AUTOTUNE takes 6.6387 seconds +AUTOTUNE addmm(32000x320, 32000x1024, 1024x320) + triton_mm_1316 0.1288 ms 100.0% + bias_addmm 0.1451 ms 88.8% + triton_mm_1318 0.1492 ms 86.3% + triton_mm_1315 0.1502 ms 85.7% + addmm 0.1558 ms 82.6% + triton_mm_1314 0.1653 ms 77.9% + triton_mm_1317 0.1708 ms 75.4% + triton_mm_1322 0.1740 ms 74.0% + triton_mm_1321 0.2398 ms 53.7% + triton_mm_1324 0.2614 ms 49.3% +SingleProcess AUTOTUNE takes 5.5531 seconds +skipping cudagraphs due to ['mutated inputs'] +skipping 
cudagraphs due to ['mutated inputs'] +skipping cudagraphs due to ['mutated inputs'] +skipping cudagraphs due to ['mutated inputs'] +skipping cudagraphs due to ['mutated inputs'] +skipping cudagraphs due to ['mutated inputs'] +skipping cudagraphs due to ['mutated inputs'] +skipping cudagraphs due to ['mutated inputs'] +[2023-12-12 17:14:54,209] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (8) +[2023-12-12 17:14:54,209] torch._dynamo.convert_frame: [WARNING] function: 'resume_in_detector_postprocess' (/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/modeling/postprocessing.py:45) +[2023-12-12 17:14:54,209] torch._dynamo.convert_frame: [WARNING] last reason: L['scale_x'] == 0.5337781484570475 # self.tensor[:, 0::2] *= scale_x # miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/structures/boxes.py:275 in scale +[2023-12-12 17:14:54,209] torch._dynamo.convert_frame: [WARNING] To log all recompilation reasons, use TORCH_LOGS="recompiles". +[2023-12-12 17:14:54,209] torch._dynamo.convert_frame: [WARNING] To diagnose recompilation issues, see https://pytorch.org/docs/master/compile/troubleshooting.html. 
+ running benchmark: 0%| | 0/30 [00:00 + async_compile.wait(globals()) + File "/home/cdhernandez/local/pytorch/torch/_inductor/codecache.py", line 2470, in wait + scope[key] = result.result() + File "/home/cdhernandez/local/pytorch/torch/_inductor/codecache.py", line 2313, in result + self.future.result() + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/concurrent/futures/_base.py", line 458, in result + return self.__get_result() + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/concurrent/futures/_base.py", line 403, in __get_result + raise self._exception +torch._dynamo.exc.BackendCompilerFailed: backend='inductor' raised: +CompilationError: at 14:40: xnumel = 196 + yoffset = tl.program_id(1).to(tl.int64) * YBLOCK + yindex = yoffset + tl.arange(0, YBLOCK)[None, :].to(tl.int64) + ymask = yindex < ynumel + xoffset = tl.program_id(0).to(tl.int64) * XBLOCK + xindex = xoffset + tl.arange(0, XBLOCK)[:, None].to(tl.int64) + xmask = xindex < xnumel + x2 = xindex + y3 = yindex + y0 = yindex % 1024 + y1 = (yindex // 1024) + tmp0 = tl.load(in_ptr0 + (x2 + (196*y3)), xmask, eviction_policy='evict_last').to(tl.float32) + ^ +ValueError('numel (262144) exceeds triton maximum tensor numel (131072)') + +Set TORCH_LOGS="+dynamo" and TORCHDYNAMO_VERBOSE=1 for more information + + +You can suppress this exception and fall back to eager by setting: + import torch._dynamo + torch._dynamo.config.suppress_errors = True + +Run failed with return code: 255 +Output: None +Error: None + loading model: 0it [00:00, ?it/s]detectron2_maskrcnn_r_101_fpn +WARNING:common:Model detectron2_maskrcnn_r_101_fpn does not support bfloat16, running with amp instead + loading model: 0it [00:06, ?it/s] +WARNING:common:Model detectron2_maskrcnn_r_101_fpn does not support bfloat16, running with amp instead +cuda eval detectron2_maskrcnn_r_101_fpn baseline-bs32 +WARNING:common:Model detectron2_maskrcnn_r_101_fpn does not support bfloat16, running with amp instead 
+skipping cudagraphs due to ['mutated inputs'] +skipping cudagraphs due to ['non-cuda device in graph'] +[2023-12-12 17:40:18,830] [30/0_1] torch._inductor.utils: [WARNING] DeviceCopy in input program +skipping cudagraphs due to ['non-cuda device in graph'] +skipping cudagraphs due to ['non-cuda device in graph'] +[2023-12-12 17:41:27,948] [30/1_1] torch._inductor.utils: [WARNING] DeviceCopy in input program +skipping cudagraphs due to ['non-cuda device in graph'] +AUTOTUNE convolution(957x256x14x14, 256x256x3x3) + convolution 1.2820 ms 100.0% + triton_convolution_1332 4.0781 ms 31.4% + triton_convolution_1327 4.1190 ms 31.1% + triton_convolution_1330 4.6714 ms 27.4% + triton_convolution_1329 5.5829 ms 23.0% + triton_convolution_1326 6.5548 ms 19.6% + triton_convolution_1331 8.8159 ms 14.5% + triton_convolution_1328 12.4215 ms 10.3% +SingleProcess AUTOTUNE takes 5.6736 seconds +AUTOTUNE convolution(957x256x28x28, 80x256x1x1) + triton_convolution_1354 0.8452 ms 100.0% + triton_convolution_1355 0.9540 ms 88.6% + triton_convolution_1359 0.9582 ms 88.2% + triton_convolution_1357 1.0138 ms 83.4% + convolution 1.0945 ms 77.2% + triton_convolution_1358 1.1035 ms 76.6% + triton_convolution_1360 1.1451 ms 73.8% + triton_convolution_1356 1.7224 ms 49.1% + conv1x1_via_mm 3.6812 ms 23.0% +SingleProcess AUTOTUNE takes 4.8280 seconds +skipping cudagraphs due to ['mutated inputs'] +skipping cudagraphs due to ['non-cuda device in graph'] +skipping cudagraphs due to ['mutated inputs'] +skipping cudagraphs due to ['non-cuda device in graph'] +skipping cudagraphs due to ['mutated inputs'] +skipping cudagraphs due to ['non-cuda device in graph'] +skipping cudagraphs due to ['mutated inputs'] +skipping cudagraphs due to ['mutated inputs'] +skipping cudagraphs due to ['mutated inputs'] +skipping cudagraphs due to ['mutated inputs'] +[2023-12-12 17:43:12,768] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (8) +[2023-12-12 17:43:12,768] 
torch._dynamo.convert_frame: [WARNING] function: 'resume_in_paste_masks_in_image' (/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/layers/mask_ops.py:123) +[2023-12-12 17:43:12,768] torch._dynamo.convert_frame: [WARNING] last reason: L['N'] == 36 # num_chunks <= N # miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/layers/mask_ops.py:125 in resume_in_paste_masks_in_image +[2023-12-12 17:43:12,768] torch._dynamo.convert_frame: [WARNING] To log all recompilation reasons, use TORCH_LOGS="recompiles". +[2023-12-12 17:43:12,768] torch._dynamo.convert_frame: [WARNING] To diagnose recompilation issues, see https://pytorch.org/docs/master/compile/troubleshooting.html. +skipping cudagraphs due to ['mutated inputs'] +[2023-12-12 17:43:22,978] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (8) +[2023-12-12 17:43:22,978] torch._dynamo.convert_frame: [WARNING] function: 'resume_in_detector_postprocess' (/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/modeling/postprocessing.py:45) +[2023-12-12 17:43:22,978] torch._dynamo.convert_frame: [WARNING] last reason: L['scale_x'] == 0.5337781484570475 # self.tensor[:, 0::2] *= scale_x # miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/structures/boxes.py:275 in scale +[2023-12-12 17:43:22,978] torch._dynamo.convert_frame: [WARNING] To log all recompilation reasons, use TORCH_LOGS="recompiles". +[2023-12-12 17:43:22,978] torch._dynamo.convert_frame: [WARNING] To diagnose recompilation issues, see https://pytorch.org/docs/master/compile/troubleshooting.html. 
+ running benchmark: 0%| | 0/30 [00:00 /home/cdhernandez/local/pytorch/torch/_inductor/decomposition.py(221)mm() +-> if config.coordinate_descent_tuning: +(Pdb) \ No newline at end of file diff --git a/torchao_benchmarks.sh b/torchao_benchmarks.sh new file mode 100644 index 0000000000..11fcbc1314 --- /dev/null +++ b/torchao_benchmarks.sh @@ -0,0 +1,25 @@ +echo "start dynamic" +python run_benchmark.py dynamo --bfloat16 --inductor --performance --inference --quantization int8dynamic --inductor-compile-mode max-autotune --tag int8dynamic +echo "start int8 weight only" +python run_benchmark.py dynamo --bfloat16 --inductor --performance --inference --quantization int8weightonly --inductor-compile-mode max-autotune --tag int8weightonly +echo "start int4 weight only" +python run_benchmark.py dynamo --bfloat16 --inductor --performance --inference --quantization int4weightonly --inductor-compile-mode max-autotune --tag int4weightonly +echo "start baseline" +python run_benchmark.py dynamo --bfloat16 --inductor --performance --inference --inductor-compile-mode max-autotune --tag baseline + +echo "start int8 weight only batchsize 1" +python run_benchmark.py dynamo --bfloat16 --inductor --performance --inference --quantization int8weightonly --inductor-compile-mode max-autotune --batch_size 1 --tag int8weightonly-bs1 +echo "start int4 weight only batchsize 1" +python run_benchmark.py dynamo --bfloat16 --inductor --performance --inference --quantization int4weightonly --inductor-compile-mode max-autotune --batch_size 1 --tag int4weightonly-bs1 +echo "start baseline batchsize 1" +python run_benchmark.py dynamo --bfloat16 --inductor --performance --inference --inductor-compile-mode max-autotune --batch_size 1 --tag baseline-bs1 + +echo "start dynamic batchsize 32" +python run_benchmark.py dynamo --bfloat16 --inductor --performance --inference --quantization int8dynamic --inductor-compile-mode max-autotune --batch_size 32 --tag int8dynamic-bs32 +echo "start baseline batchsize 32" 
+python run_benchmark.py dynamo --bfloat16 --inductor --performance --inference --inductor-compile-mode max-autotune --batch_size 32 --tag baseline-bs32 + +echo "start accuracy" +python run_benchmark.py dynamo --bfloat16 --inductor --inference --quantization int8dynamic --inductor-compile-mode max-autotune --batch_size 1 --tag int8dynamic-bs1-acc --accuracy +python run_benchmark.py dynamo --bfloat16 --inductor --inference --quantization int8weightonly --inductor-compile-mode max-autotune --batch_size 1 --tag int8weightonly-bs1-acc --accuracy +python run_benchmark.py dynamo --bfloat16 --inductor --inference --quantization int4weightonly --inductor-compile-mode max-autotune --batch_size 1 --tag int4weightonly-bs1-acc --accuracy diff --git a/userbenchmark/dynamo/dynamobench/common.py b/userbenchmark/dynamo/dynamobench/common.py index b11c3cbf62..fd2df37f45 100644 --- a/userbenchmark/dynamo/dynamobench/common.py +++ b/userbenchmark/dynamo/dynamobench/common.py @@ -73,6 +73,12 @@ from torch.utils import _pytree as pytree from torch.utils._pytree import tree_map, tree_map_only +import torchao +from torchao.quantization import ( + change_linear_weights_to_int8_dqtensors, + change_linear_weights_to_int8_woqtensors, + change_linear_weights_to_int4_woqtensors +) from tqdm.auto import tqdm, trange @@ -576,7 +582,7 @@ def maybe_mark_profile(*args, **kwargs): first_fields.append(kwargs["tag"]) headers = first_headers + ["speedup", "abs_latency"] row = first_fields + [float(speedup), median[1] * 1000] - msg = f"{speedup:.3f}x" + msg = f"{speedup*1000:.3f}ms" if args.baseline: headers.extend( [ @@ -2066,7 +2072,7 @@ def deepcopy_and_maybe_ddp(self, model): return model def check_accuracy( - self, name, model, example_inputs, optimize_ctx, experiment, tag + self, name, model, example_inputs, optimize_ctx, experiment, tag, res=None ): """ Checks accuracy. 
@@ -2238,6 +2244,10 @@ def record_status(accuracy_status, dynamo_start_stats): finally: del model_copy + sqnr = float("nan") + if res is not None and isinstance(res, torch.Tensor): + sqnr = 20 * torch.log10(torch.linalg.norm(res) / torch.linalg.norm(res - new_result)).item() + if name in self.skip_accuracy_check_as_eager_non_deterministic: return record_status("pass_due_to_skip", dynamo_start_stats=start_stats) @@ -2276,9 +2286,7 @@ def record_status(accuracy_status, dynamo_start_stats): accuracy_status = "pass_due_to_skip" else: accuracy_status = "fail_accuracy" - return record_status(accuracy_status, dynamo_start_stats=start_stats) - - return record_status(accuracy_status, dynamo_start_stats=start_stats) + return record_status(accuracy_status+f"-sqnr-{sqnr:.3f}", dynamo_start_stats=start_stats) def check_tolerance( self, name, model, example_inputs, optimize_ctx, base_device="cpu" @@ -2496,6 +2504,7 @@ def run_one_model( experiment, explain=False, tag=None, + res=None, ): mode = "train" if self.args.training else "eval" msg = f"{current_device:4} {mode:5} {current_name:34} " @@ -2507,7 +2516,7 @@ def run_one_model( if self.args.accuracy: status = self.check_accuracy( - name, model, example_inputs, optimize_ctx, experiment, tag + name, model, example_inputs, optimize_ctx, experiment, tag, res, ) print(status) if status == "fail_accuracy" and self.args.minify: @@ -2714,6 +2723,11 @@ def get_example_inputs(self): action="store_true", help="Create n processes based on the number of devices (distributed use case).", ) + parser.add_argument( + "--quantization", + choices=["int8dynamic", "int8weightonly", "int4weightonly"], + help="Apply quantization to the model before running it", + ) parser.add_argument( "--ddp", action="store_true", @@ -3535,6 +3549,7 @@ def run(runner, args, original_dir=None): extra_args=extra_args, ) else: + print(model_name) ( device, name, @@ -3547,6 +3562,24 @@ def run(runner, args, original_dir=None): batch_size=batch_size, extra_args=extra_args, ) 
+ res = None + if args.quantization: + if args.accuracy: + res=model(*example_inputs) # to later calculate SQNR + + torch._dynamo.config.automatic_dynamic_shapes = False + torch._dynamo.config.force_parameter_static_shapes = False + torch._dynamo.config.cache_size_limit = 1000 + assert "cuda" in device + if args.quantization=="int8dynamic": + torch._inductor.config.force_fuse_int_mm_with_mul = True + change_linear_weights_to_int8_dqtensors(model) + elif args.quantization=="int8weightonly": + torch._inductor.config.use_mixed_mm = True + change_linear_weights_to_int8_woqtensors(model) + elif args.quantization=="int4weightonly": + change_linear_weights_to_int4_woqtensors(model) + + except NotImplementedError as e: print(e) import traceback @@ -3614,6 +3647,7 @@ def detect_and_mark_batch(t): experiment, explain=args.explain, tag=args.tag, + res=res, ) if args.generate_aot_autograd_stats: stats_file = output_filename.split(".csv")[0] + "_stats.csv" @@ -3629,8 +3663,8 @@ def detect_and_mark_batch(t): ) else: metrics.purge_old_log_files() - if output_filename and os.path.exists(output_filename): - os.unlink(output_filename) + # if output_filename and os.path.exists(output_filename): + # os.unlink(output_filename) if original_dir: os.chdir(original_dir) model_names = list(runner.iter_model_names(args)) diff --git a/userbenchmark/dynamo/dynamobench/torchbench.py b/userbenchmark/dynamo/dynamobench/torchbench.py index 9919332e1d..3222762243 100755 --- a/userbenchmark/dynamo/dynamobench/torchbench.py +++ b/userbenchmark/dynamo/dynamobench/torchbench.py @@ -265,6 +265,17 @@ def setup_torchbench_cwd(): "tts_angular", "pyhpc_turbulent_kinetic_energy", "detectron2_fcos_r_50_fpn", + "detectron2_fasterrcnn_r_101_dc5", + "detectron2_fasterrcnn_r_50_c4", + "detectron2_fasterrcnn_r_101_c4", + "detectron2_fasterrcnn_r_101_fpn", + "detectron2_fasterrcnn_r_50_dc5", + "detectron2_fasterrcnn_r_50_fpn", + "detectron2_maskrcnn_r_101_c4", + "detectron2_maskrcnn_r_101_fpn", + 
"detectron2_maskrcnn_r_50_c4", + "detectron2_maskrcnn_r_50_fpn", + "demucs", } FORCE_FP16_FOR_BF16_MODELS = {"vision_maskrcnn"}