diff --git a/src/lib.rs b/src/lib.rs index 95236e4..bb29698 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -82,12 +82,23 @@ fn run_parser<'t>( payload: &str, output_count: &mut i32, output: &mut Vec<(PathBuf, String)>, - compile_directory: &mut Vec<(String, String, i32)>, + compile_directory: &mut Vec<(String, String, i32, String)>, multi: &MultiProgress, stats: &mut Stats, ) { if let Some(md) = parser.get_metadata(&e) { let results = parser.parse(lineno, md, e.rank, &e.compile_id, &payload); + fn extact_suffix(filename: &String) -> String { + if filename.contains("fx_graph_cache_miss") { + "❎".to_string() + } else if filename.contains("fx_graph_cache_hit") { + "✅".to_string() + } else if filename.contains("fx_graph_cache_bypass") { + "❓".to_string() + } else { + "".to_string() + } + } match results { Ok(results) => { for parser_result in results { @@ -108,25 +119,29 @@ fn run_parser<'t>( }; output.push((filename.clone(), out)); let filename_str = format!("{}", filename.to_string_lossy()); + let suffix = extact_suffix(&filename_str); compile_directory.push(( filename_str.clone(), filename_str, *output_count, + suffix, )); *output_count += 1; } ParserOutput::GlobalFile(filename, out) => { output.push((filename.clone(), out)); let filename_str = format!("{}", filename.to_string_lossy()); + let suffix = extact_suffix(&filename_str); compile_directory.push(( filename_str.clone(), filename_str, *output_count, + suffix, )); *output_count += 1; } ParserOutput::Link(name, url) => { - compile_directory.push((url, name, *output_count)); + compile_directory.push((url, name, *output_count, "".to_string())); *output_count += 1; } } @@ -191,7 +206,7 @@ pub fn parse_path(path: &PathBuf, config: ParseConfig) -> anyhow::Result (link, rendered name, output number) // For files, link and rendered name are the same // For links, you can specify a custom name for the link - let mut directory: FxIndexMap, Vec<(String, String, i32)>> = + let mut directory: FxIndexMap, Vec<(String, String, i32, String)>> = FxIndexMap::default(); let mut metrics_index: CompilationMetricsIndex = FxIndexMap::default(); diff --git a/src/parsers.rs b/src/parsers.rs index eab4d32..f5978de 100644 --- a/src/parsers.rs +++ b/src/parsers.rs @@ -349,7 +349,7 @@ pub struct CompilationMetricsParser<'t> { pub tt: &'t TinyTemplate<'t>, pub stack_index: &'t RefCell, pub symbolic_shape_specialization_index: &'t RefCell, - pub output_files: &'t Vec<(String, String, i32)>, + pub output_files: &'t Vec<(String, String, i32, String)>, pub compile_id_dir: &'t PathBuf, } impl StructuredLogParser for CompilationMetricsParser<'_> { @@ -416,11 +416,16 @@ impl StructuredLogParser for CompilationMetricsParser<'_> { let new_str: String = parts[1..].join(""); new_str }; - let output_files: Vec<(String, String, i32)> = self + let output_files: Vec<(String, String, i32, String)> = self .output_files .iter() - .map(|(url, name, number)| { - return (remove_prefix(url), remove_prefix(name), number.clone()); + .map(|(url, name, number, suffix)| { + ( + remove_prefix(url), + remove_prefix(name), + number.clone(), + suffix.clone(), + ) }) .collect(); let context = CompilationMetricsContext { diff --git a/src/templates.rs b/src/templates.rs index 73b2df1..69f7d78 100644 --- a/src/templates.rs +++ b/src/templates.rs @@ -138,7 +138,7 @@ Build products below:
  • {compile_directory.0}
      {{ for path_idx in compile_directory.1 }} -
    • {path_idx.1} ({path_idx.2})
    • +
    • {path_idx.1} {path_idx.3} ({path_idx.2})
    • {{ endfor }}
  • diff --git a/src/types.rs b/src/types.rs index 8f54b45..ef5c32d 100644 --- a/src/types.rs +++ b/src/types.rs @@ -351,7 +351,7 @@ pub struct CompilationMetricsContext<'e> { pub compile_id: String, pub stack_html: String, pub symbolic_shape_specializations: Vec, - pub output_files: &'e Vec<(String, String, i32)>, + pub output_files: &'e Vec<(String, String, i32, String)>, pub compile_id_dir: &'e PathBuf, pub mini_stack_html: String, } @@ -556,7 +556,7 @@ pub struct DynamoGuardsContext { pub struct IndexContext { pub css: &'static str, pub javascript: &'static str, - pub directory: Vec<(String, Vec<(String, String, i32)>)>, + pub directory: Vec<(String, Vec<(String, String, i32, String)>)>, pub stack_trie_html: String, pub unknown_stack_trie_html: String, pub has_unknown_stack_trie: bool, diff --git a/tests/inputs/cache_hit_miss.log b/tests/inputs/cache_hit_miss.log new file mode 100644 index 0000000..9fe0f4f --- /dev/null +++ b/tests/inputs/cache_hit_miss.log @@ -0,0 +1,5574 @@ +V1003 10:10:50.629000 2235078 torch/_logging/structured.py:22] {"str": ["/data/users/oulgen/pytorch/torch/_dynamo/convert_frame.py", 0]} +V1003 10:10:50.630000 2235078 torch/_logging/structured.py:22] {"str": ["/data/users/oulgen/pytorch/test/inductor/test_codecache.py", 1]} +V1003 10:10:50.630000 2235078 torch/_logging/structured.py:22] {"str": ["/data/users/oulgen/pytorch/torch/_inductor/test_case.py", 2]} +V1003 10:10:50.630000 2235078 torch/_logging/structured.py:22] {"str": ["/data/users/oulgen/pytorch/torch/_dynamo/test_case.py", 3]} +V1003 10:10:50.631000 2235078 torch/_logging/structured.py:22] {"str": ["/data/users/oulgen/pytorch/torch/testing/_internal/common_utils.py", 4]} +V1003 10:10:50.631000 2235078 torch/_logging/structured.py:22] {"str": ["/home/oulgen/.conda/envs/py311/lib/python3.11/unittest/main.py", 5]} +V1003 10:10:50.631000 2235078 torch/_logging/structured.py:22] {"str": ["/home/oulgen/.conda/envs/py311/lib/python3.11/unittest/runner.py", 6]} +V1003 10:10:50.631000 2235078 torch/_logging/structured.py:22] {"str": ["/home/oulgen/.conda/envs/py311/lib/python3.11/unittest/suite.py", 7]} +V1003 10:10:50.631000 2235078 torch/_logging/structured.py:22] {"str": ["/home/oulgen/.conda/envs/py311/lib/python3.11/unittest/case.py", 8]} +V1003 10:10:50.632000 2235078 torch/_logging/structured.py:22] {"str": ["/home/oulgen/.conda/envs/py311/lib/python3.11/contextlib.py", 9]} +V1003 10:10:50.632000 2235078 torch/_logging/structured.py:22] {"str": ["/data/users/oulgen/pytorch/torch/nn/attention/flex_attention.py", 10]} +V1003 10:10:50.632000 2235078 torch/_dynamo/convert_frame.py:915] {"dynamo_start": {"stack": [{"line": 916, "name": "", "filename": 1}, {"line": 14, "name": "run_tests", "filename": 2}, {"line": 38, "name": "run_tests", "filename": 3}, {"line": 1273, "name": "run_tests", "filename": 4}, {"line": 102, "name": "__init__", "filename": 5}, {"line": 274, "name": "runTests", "filename": 5}, {"line": 217, "name": "run", "filename": 6}, {"line": 84, "name": "__call__", "filename": 7}, {"line": 122, "name": "run", "filename": 7}, {"line": 84, "name": "__call__", "filename": 7}, {"line": 122, "name": "run", "filename": 7}, {"line": 678, "name": "__call__", "filename": 8}, {"line": 3116, "name": "run", "filename": 4}, {"line": 3088, "name": "_run_custom", "filename": 4}, {"line": 623, "name": "run", "filename": 8}, {"line": 579, "name": "_callTestMethod", "filename": 8}, {"line": 2983, "name": "wrapper", "filename": 4}, {"line": 81, "name": "inner", "filename": 9}, {"line": 81, "name": "inner", "filename": 9}, {"line": 395, "name": "test_flex_attention_caching", "filename": 1}, {"line": 380, "name": "fn", "filename": 1}, {"line": 1062, "name": "flex_attention", "filename": 10}, {"line": 1049, "name": "_flex_attention_hop_wrapper", "filename": 10}]}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0} +V1003 10:10:50.632000 2235078 torch/_dynamo/utils.py:988] {"chromium_event": {}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "d442394c2c9a973ef4b64ef5deddb37f"} + { + "name": "_compile.compile_inner", + "ts": 1727975450632606.5, + "args": null, + "ph": "B", + "cat": "dynamo_timed", + "tid": 0, + "pid": 0 + } +V1003 10:10:50.633000 2235078 torch/_dynamo/utils.py:988] {"chromium_event": {}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "686bcefec76c75096eb61274391c2867"} + { + "name": "entire_frame_compile", + "ts": 1727975450632606.5, + "args": null, + "ph": "B", + "cat": "dynamo_timed", + "tid": 0, + "pid": 0 + } +V1003 10:10:50.639000 2235078 torch/_subclasses/meta_utils.py:204] {"describe_storage": {"id": 0, "describer_id": 0, "size": 524288}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0} +V1003 10:10:50.640000 2235078 torch/_subclasses/meta_utils.py:417] {"describe_tensor": {"id": 0, "ndim": 4, "dtype": "torch.float32", "device": "device(type='cuda', index=0)", "size": [1, 4, 512, 64], "is_leaf": true, "stride": [131072, 32768, 64, 1], "storage": 0, "view_func": "", "describer_id": 0}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0} +V1003 10:10:50.640000 2235078 torch/_subclasses/meta_utils.py:1640] {"describe_source": {"describer_id": 0, "id": 0, "source": "L['args'][0]"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0} +V1003 10:10:50.835000 2235078 torch/_subclasses/meta_utils.py:204] {"describe_storage": {"id": 1, "describer_id": 0, "size": 524288}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0} +V1003 10:10:50.836000 2235078 torch/_subclasses/meta_utils.py:417] {"describe_tensor": {"id": 4, "ndim": 4, "dtype": "torch.float32", "device": "device(type='cuda', index=0)", "size": [1, 4, 512, 64], "is_leaf": true, "stride": [131072, 32768, 64, 1], "storage": 1, "view_func": "", "describer_id": 0}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0} +V1003 10:10:50.836000 2235078 torch/_subclasses/meta_utils.py:1640] {"describe_source": {"describer_id": 0, "id": 4, "source": "L['args'][1]"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0} +V1003 10:10:50.837000 2235078 torch/_subclasses/meta_utils.py:204] {"describe_storage": {"id": 2, "describer_id": 0, "size": 524288}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0} +V1003 10:10:50.837000 2235078 torch/_subclasses/meta_utils.py:417] {"describe_tensor": {"id": 5, "ndim": 4, "dtype": "torch.float32", "device": "device(type='cuda', index=0)", "size": [1, 4, 512, 64], "is_leaf": true, "stride": [131072, 32768, 64, 1], "storage": 2, "view_func": "", "describer_id": 0}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0} +V1003 10:10:50.838000 2235078 torch/_subclasses/meta_utils.py:1640] {"describe_source": {"describer_id": 0, "id": 5, "source": "L['args'][2]"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0} +V1003 10:10:50.839000 2235078 torch/_subclasses/meta_utils.py:204] {"describe_storage": {"id": 3, "describer_id": 0, "size": 64}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0} +V1003 10:10:50.839000 2235078 torch/_subclasses/meta_utils.py:417] {"describe_tensor": {"id": 6, "ndim": 3, "dtype": "torch.int32", "device": "device(type='cuda', index=0)", "size": [1, 1, 16], "is_leaf": true, "stride": [16, 16, 1], "storage": 3, "view_func": "", "describer_id": 0}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0} +V1003 10:10:50.839000 2235078 torch/_subclasses/meta_utils.py:1640] {"describe_source": {"describer_id": 0, "id": 6, "source": "L['args'][4][0]"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0} +V1003 10:10:50.840000 2235078 torch/_subclasses/meta_utils.py:204] {"describe_storage": {"id": 4, "describer_id": 0, "size": 1024}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0} +V1003 10:10:50.841000 2235078 torch/_subclasses/meta_utils.py:417] {"describe_tensor": {"id": 7, "ndim": 4, "dtype": "torch.int32", "device": "device(type='cuda', index=0)", "size": [1, 1, 16, 16], "is_leaf": true, "stride": [256, 256, 16, 1], "storage": 4, "view_func": "", "describer_id": 0}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0} +V1003 10:10:50.841000 2235078 torch/_subclasses/meta_utils.py:1640] {"describe_source": {"describer_id": 0, "id": 7, "source": "L['args'][4][1]"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0} +V1003 10:10:50.842000 2235078 torch/_subclasses/meta_utils.py:204] {"describe_storage": {"id": 5, "describer_id": 0, "size": 64}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0} +V1003 10:10:50.842000 2235078 torch/_subclasses/meta_utils.py:417] {"describe_tensor": {"id": 8, "ndim": 3, "dtype": "torch.int32", "device": "device(type='cuda', index=0)", "size": [1, 1, 16], "is_leaf": true, "stride": [16, 16, 1], "storage": 5, "view_func": "", "describer_id": 0}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0} +V1003 10:10:50.843000 2235078 torch/_subclasses/meta_utils.py:1640] {"describe_source": {"describer_id": 0, "id": 8, "source": "L['args'][4][2]"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0} +V1003 10:10:50.844000 2235078 torch/_subclasses/meta_utils.py:204] {"describe_storage": {"id": 6, "describer_id": 0, "size": 1024}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0} +V1003 10:10:50.844000 2235078 torch/_subclasses/meta_utils.py:417] {"describe_tensor": {"id": 9, "ndim": 4, "dtype": "torch.int32", "device": "device(type='cuda', index=0)", "size": [1, 1, 16, 16], "is_leaf": true, "stride": [256, 256, 16, 1], "storage": 6, "view_func": "", "describer_id": 0}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0} +V1003 10:10:50.844000 2235078 torch/_subclasses/meta_utils.py:1640] {"describe_source": {"describer_id": 0, "id": 9, "source": "L['args'][4][3]"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0} +V1003 10:10:50.845000 2235078 torch/_subclasses/meta_utils.py:204] {"describe_storage": {"id": 7, "describer_id": 0, "size": 64}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0} +V1003 10:10:50.846000 2235078 torch/_subclasses/meta_utils.py:417] {"describe_tensor": {"id": 10, "ndim": 3, "dtype": "torch.int32", "device": "device(type='cuda', index=0)", "size": [1, 1, 16], "is_leaf": true, "stride": [16, 16, 1], "storage": 7, "view_func": "", "describer_id": 0}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0} +V1003 10:10:50.846000 2235078 torch/_subclasses/meta_utils.py:1640] {"describe_source": {"describer_id": 0, "id": 10, "source": "L['args'][4][4]"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0} +V1003 10:10:50.847000 2235078 torch/_subclasses/meta_utils.py:204] {"describe_storage": {"id": 8, "describer_id": 0, "size": 1024}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0} +V1003 10:10:50.847000 2235078 torch/_subclasses/meta_utils.py:417] {"describe_tensor": {"id": 11, "ndim": 4, "dtype": "torch.int32", "device": "device(type='cuda', index=0)", "size": [1, 1, 16, 16], "is_leaf": true, "stride": [256, 256, 16, 1], "storage": 8, "view_func": "", "describer_id": 0}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0} +V1003 10:10:50.847000 2235078 torch/_subclasses/meta_utils.py:1640] {"describe_source": {"describer_id": 0, "id": 11, "source": "L['args'][4][5]"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0} +V1003 10:10:50.848000 2235078 torch/_subclasses/meta_utils.py:204] {"describe_storage": {"id": 9, "describer_id": 0, "size": 64}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0} +V1003 10:10:50.849000 2235078 torch/_subclasses/meta_utils.py:417] {"describe_tensor": {"id": 12, "ndim": 3, "dtype": "torch.int32", "device": "device(type='cuda', index=0)", "size": [1, 1, 16], "is_leaf": true, "stride": [16, 16, 1], "storage": 9, "view_func": "", "describer_id": 0}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0} +V1003 10:10:50.849000 2235078 torch/_subclasses/meta_utils.py:1640] {"describe_source": {"describer_id": 0, "id": 12, "source": "L['args'][4][6]"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0} +V1003 10:10:50.850000 2235078 torch/_subclasses/meta_utils.py:204] {"describe_storage": {"id": 10, "describer_id": 0, "size": 1024}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0} +V1003 10:10:50.850000 2235078 torch/_subclasses/meta_utils.py:417] {"describe_tensor": {"id": 13, "ndim": 4, "dtype": "torch.int32", "device": "device(type='cuda', index=0)", "size": [1, 1, 16, 16], "is_leaf": true, "stride": [256, 256, 16, 1], "storage": 10, "view_func": "", "describer_id": 0}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0} +V1003 10:10:50.850000 2235078 torch/_subclasses/meta_utils.py:1640] {"describe_source": {"describer_id": 0, "id": 13, "source": "L['args'][4][7]"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0} +V1003 10:10:50.864000 2235078 torch/_dynamo/output_graph.py:1347] {"dynamo_output_graph": {"sizes": {"l_args_0_": [1, 4, 512, 64], "l_args_1_": [1, 4, 512, 64], "l_args_2_": [1, 4, 512, 64], "l_args_4_0_": [1, 1, 16], "l_args_4_1_": [1, 1, 16, 16], "l_args_4_2_": [1, 1, 16], "l_args_4_3_": [1, 1, 16, 16], "l_args_4_4_": [1, 1, 16], "l_args_4_5_": [1, 1, 16, 16], "l_args_4_6_": [1, 1, 16], "l_args_4_7_": [1, 1, 16, 16], "child_1": [], "child_2": [], "child_3": [], "child_4": [], "child": [], "child_5": [], "child_6": [], "child_7": [], "child_8": [], "getitem": [1, 4, 512, 64], "getitem_1": [1, 4, 512]}}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "36de5ad6eb1efc648a27dc62c107a2ca"} + class GraphModule(torch.nn.Module): + def forward(self, L_args_0_: "f32[1, 4, 512, 64][131072, 32768, 64, 1]cuda:0", L_args_1_: "f32[1, 4, 512, 64][131072, 32768, 64, 1]cuda:0", L_args_2_: "f32[1, 4, 512, 64][131072, 32768, 64, 1]cuda:0", L_args_4_0_: "i32[1, 1, 16][16, 16, 1]cuda:0", L_args_4_1_: "i32[1, 1, 16, 16][256, 256, 16, 1]cuda:0", L_args_4_2_: "i32[1, 1, 16][16, 16, 1]cuda:0", L_args_4_3_: "i32[1, 1, 16, 16][256, 256, 16, 1]cuda:0", L_args_4_4_: "i32[1, 1, 16][16, 16, 1]cuda:0", L_args_4_5_: "i32[1, 1, 16, 16][256, 256, 16, 1]cuda:0", L_args_4_6_: "i32[1, 1, 16][16, 16, 1]cuda:0", L_args_4_7_: "i32[1, 1, 16, 16][256, 256, 16, 1]cuda:0"): + l_args_0_ = L_args_0_ + l_args_1_ = L_args_1_ + l_args_2_ = L_args_2_ + l_args_4_0_ = L_args_4_0_ + l_args_4_1_ = L_args_4_1_ + l_args_4_2_ = L_args_4_2_ + l_args_4_3_ = L_args_4_3_ + l_args_4_4_ = L_args_4_4_ + l_args_4_5_ = L_args_4_5_ + l_args_4_6_ = L_args_4_6_ + l_args_4_7_ = L_args_4_7_ + + # File: /data/users/oulgen/pytorch/torch/nn/attention/flex_attention.py:1050 in _flex_attention_hop_wrapper, code: return flex_attention_hop(*args, **kwargs) + child_1: "i32[][]cuda:0" = l_args_0_.new_empty([], dtype = torch.int32); child_1 = None + child_2: "i32[][]cuda:0" = l_args_0_.new_empty([], dtype = torch.int32); child_2 = None + child_3: "i32[][]cuda:0" = l_args_0_.new_empty([], dtype = torch.int32); child_3 = None + child_4: "i32[][]cuda:0" = l_args_0_.new_empty([], dtype = torch.int32); child_4 = None + child: "f32[][]cuda:0" = l_args_0_.new_empty([], requires_grad = False); child = None + score_mod_0 = self.score_mod_0 + child_5: "i32[][]cuda:0" = l_args_0_.new_empty([], dtype = torch.int32); child_5 = None + child_6: "i32[][]cuda:0" = l_args_0_.new_empty([], dtype = torch.int32); child_6 = None + child_7: "i32[][]cuda:0" = l_args_0_.new_empty([], dtype = torch.int32); child_7 = None + child_8: "i32[][]cuda:0" = l_args_0_.new_empty([], dtype = torch.int32); child_8 = None + mask_fn_0 = self.mask_fn_0 + flex_attention = torch.ops.higher_order.flex_attention(l_args_0_, l_args_1_, l_args_2_, score_mod_0, (l_args_4_0_, l_args_4_1_, l_args_4_2_, l_args_4_3_, l_args_4_4_, l_args_4_5_, l_args_4_6_, l_args_4_7_, 128, 128, mask_fn_0), 0.125, {'ROWS_GUARANTEED_SAFE': False, 'PRESCALE_QK': False, 'OUTPUT_LOGSUMEXP': False}, (), ()); l_args_0_ = l_args_1_ = l_args_2_ = score_mod_0 = l_args_4_0_ = l_args_4_1_ = l_args_4_2_ = l_args_4_3_ = l_args_4_4_ = l_args_4_5_ = l_args_4_6_ = l_args_4_7_ = mask_fn_0 = None + getitem: "f32[1, 4, 512, 64][131072, 32768, 64, 1]cuda:0" = flex_attention[0] + getitem_1: "f32[1, 4, 512][2048, 512, 1]cuda:0" = flex_attention[1]; flex_attention = None + return (getitem, getitem_1) + + class score_mod_0(torch.nn.Module): + def forward(self, child: "f32[][]cuda:0", child_1: "i32[][]cuda:0", child_2: "i32[][]cuda:0", child_3: "i32[][]cuda:0", child_4: "i32[][]cuda:0"): + # File: /data/users/oulgen/pytorch/test/inductor/test_codecache.py:377 in score_mod, code: return score + (q - kv) + sub: "i32[][]cuda:0" = child_3 - child_4; child_3 = child_4 = None + add: "f32[][]cuda:0" = child + sub; child = sub = None + return add + + class mask_fn_0(torch.nn.Module): + def forward(self, child_5: "i32[][]cuda:0", child_6: "i32[][]cuda:0", child_7: "i32[][]cuda:0", child_8: "i32[][]cuda:0"): + # File: /data/users/oulgen/pytorch/test/inductor/test_codecache.py:373 in , code: lambda b, h, q, kv: q >= kv, None, None, 2048, 2048 + ge: "b8[][]cuda:0" = child_7 >= child_8; child_7 = child_8 = None + return ge + +V1003 10:10:50.865000 2235078 torch/_dynamo/utils.py:988] {"chromium_event": {}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "78b3ca52339404a8a8b475af3393e9be"} + { + "name": "OutputGraph.call_user_compiler", + "ts": 1727975450865801.5, + "args": null, + "ph": "B", + "cat": "dynamo_timed", + "tid": 0, + "pid": 0 + } +V1003 10:10:50.866000 2235078 torch/_dynamo/utils.py:988] {"chromium_event": {}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "f7bc91b808bbb22050236e69d68d305e"} + { + "name": "backend_compile", + "ts": 1727975450865801.5, + "args": null, + "ph": "B", + "cat": "dynamo_timed", + "tid": 0, + "pid": 0 + } +V1003 10:10:50.866000 2235078 torch/_dynamo/utils.py:988] {"chromium_event": {}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "4a09b2e379da345bd3263251d88c5014"} + { + "name": "backend_compile", + "ts": 1727975450866663.5, + "args": { + "cache_stats": { + "fxgraph_cache_hit": 0, + "fxgraph_cache_miss": 0, + "fxgraph_cache_bypass": 0 + } + }, + "ph": "E", + "cat": "dynamo_timed", + "tid": 0, + "pid": 0 + } +V1003 10:10:50.867000 2235078 torch/_dynamo/utils.py:988] {"chromium_event": {}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "53dff51299e2eac55b28bb862cc5cb64"} + { + "name": "OutputGraph.call_user_compiler", + "ts": 1727975450867057.0, + "args": { + "cache_stats": { + "fxgraph_cache_hit": 0, + "fxgraph_cache_miss": 0, + "fxgraph_cache_bypass": 0 + } + }, + "ph": "E", + "cat": "dynamo_timed", + "tid": 0, + "pid": 0 + } +V1003 10:10:50.893000 2235078 torch/_dynamo/guards.py:2311] {"dynamo_cpp_guards_str": {}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "e693a6590122b61ee5c84da6a497ef23"} + + TREE_GUARD_MANAGER: + +- RootGuardManager + | +- DEFAULT_DEVICE: utils_device.CURRENT_DEVICE == None # _dynamo/output_graph.py:471 in init_ambient_guards + | +- GLOBAL_STATE: ___check_global_state() + | +- TORCH_FUNCTION_MODE_STACK: ___check_torch_function_mode_stack() + | +- GuardManager: source=L['args'], accessed_by=DictGetItemGuardAccessor(args) + | | +- TYPE_MATCH: ___check_type_id(L['args'], 8815232) + | | +- LENGTH_CHECK: len(L['args']) == 7 + | | +- GuardManager: source=L['args'][0], accessed_by=TupleGetItemGuardAccessor(0) + | | | +- TENSOR_MATCH: check_tensor(L['args'][0], Tensor, DispatchKeySet(CUDA, BackendSelect, ADInplaceOrView, AutogradCUDA), torch.float32, device=0, requires_grad=False, size=[1, 4, 512, 64], stride=[131072, 32768, 64, 1]) + | | | +- NO_HASATTR: hasattr(L['args'][0], '_dynamo_dynamic_indices') == False + | | | +- NO_TENSOR_ALIASING: check_no_aliasing(L['args'][0], L['args'][1], L['args'][2], L['args'][4][0], L['args'][4][1], L['args'][4][2], L['args'][4][3], L['args'][4][4], L['args'][4][5], L['args'][4][6], L['args'][4][7]) + | | +- GuardManager: source=L['args'][1], accessed_by=TupleGetItemGuardAccessor(1) + | | | +- TENSOR_MATCH: check_tensor(L['args'][1], Tensor, DispatchKeySet(CUDA, BackendSelect, ADInplaceOrView, AutogradCUDA), torch.float32, device=0, requires_grad=False, size=[1, 4, 512, 64], stride=[131072, 32768, 64, 1]) + | | | +- NO_HASATTR: hasattr(L['args'][1], '_dynamo_dynamic_indices') == False + | | | +- NO_TENSOR_ALIASING + | | +- GuardManager: source=L['args'][2], accessed_by=TupleGetItemGuardAccessor(2) + | | | +- TENSOR_MATCH: check_tensor(L['args'][2], Tensor, DispatchKeySet(CUDA, BackendSelect, ADInplaceOrView, AutogradCUDA), torch.float32, device=0, requires_grad=False, size=[1, 4, 512, 64], stride=[131072, 32768, 64, 1]) + | | | +- NO_HASATTR: hasattr(L['args'][2], '_dynamo_dynamic_indices') == False + | | | +- NO_TENSOR_ALIASING + | | +- GuardManager: source=L['args'][3], accessed_by=TupleGetItemGuardAccessor(3) + | | | +- GuardManager: source=L['args'][3].__code__, accessed_by=GetAttrGuardAccessor(__code__) + | | | | +- ID_MATCH: ___check_obj_id(L['args'][3].__code__, 140413271879296) + | | +- GuardManager: source=L['args'][4], accessed_by=TupleGetItemGuardAccessor(4) + | | | +- TYPE_MATCH: ___check_type_id(L['args'][4], 8815232) + | | | +- LENGTH_CHECK: len(L['args'][4]) == 11 + | | | +- GuardManager: source=L['args'][4][0], accessed_by=TupleGetItemGuardAccessor(0) + | | | | +- TENSOR_MATCH: check_tensor(L['args'][4][0], Tensor, DispatchKeySet(CUDA, BackendSelect, ADInplaceOrView, AutogradCUDA), torch.int32, device=0, requires_grad=False, size=[1, 1, 16], stride=[16, 16, 1]) + | | | | +- NO_HASATTR: hasattr(L['args'][4][0], '_dynamo_dynamic_indices') == False + | | | | +- NO_TENSOR_ALIASING + | | | +- GuardManager: source=L['args'][4][1], accessed_by=TupleGetItemGuardAccessor(1) + | | | | +- TENSOR_MATCH: check_tensor(L['args'][4][1], Tensor, DispatchKeySet(CUDA, BackendSelect, ADInplaceOrView, AutogradCUDA), torch.int32, device=0, requires_grad=False, size=[1, 1, 16, 16], stride=[256, 256, 16, 1]) + | | | | +- NO_HASATTR: hasattr(L['args'][4][1], '_dynamo_dynamic_indices') == False + | | | | +- NO_TENSOR_ALIASING + | | | +- GuardManager: source=L['args'][4][2], accessed_by=TupleGetItemGuardAccessor(2) + | | | | +- TENSOR_MATCH: check_tensor(L['args'][4][2], Tensor, DispatchKeySet(CUDA, BackendSelect, ADInplaceOrView, AutogradCUDA), torch.int32, device=0, requires_grad=False, size=[1, 1, 16], stride=[16, 16, 1]) + | | | | +- NO_HASATTR: hasattr(L['args'][4][2], '_dynamo_dynamic_indices') == False + | | | | +- NO_TENSOR_ALIASING + | | | +- GuardManager: source=L['args'][4][3], accessed_by=TupleGetItemGuardAccessor(3) + | | | | +- TENSOR_MATCH: check_tensor(L['args'][4][3], Tensor, DispatchKeySet(CUDA, BackendSelect, ADInplaceOrView, AutogradCUDA), torch.int32, device=0, requires_grad=False, size=[1, 1, 16, 16], stride=[256, 256, 16, 1]) + | | | | +- NO_HASATTR: hasattr(L['args'][4][3], '_dynamo_dynamic_indices') == False + | | | | +- NO_TENSOR_ALIASING + | | | +- GuardManager: source=L['args'][4][4], accessed_by=TupleGetItemGuardAccessor(4) + | | | | +- TENSOR_MATCH: check_tensor(L['args'][4][4], Tensor, DispatchKeySet(CUDA, BackendSelect, ADInplaceOrView, AutogradCUDA), torch.int32, device=0, requires_grad=False, size=[1, 1, 16], stride=[16, 16, 1]) + | | | | +- NO_HASATTR: hasattr(L['args'][4][4], '_dynamo_dynamic_indices') == False + | | | | +- NO_TENSOR_ALIASING + | | | +- GuardManager: source=L['args'][4][5], accessed_by=TupleGetItemGuardAccessor(5) + | | | | +- TENSOR_MATCH: check_tensor(L['args'][4][5], Tensor, DispatchKeySet(CUDA, BackendSelect, ADInplaceOrView, AutogradCUDA), torch.int32, device=0, requires_grad=False, size=[1, 1, 16, 16], stride=[256, 256, 16, 1]) + | | | | +- NO_HASATTR: hasattr(L['args'][4][5], '_dynamo_dynamic_indices') == False + | | | | +- NO_TENSOR_ALIASING + | | | +- GuardManager: source=L['args'][4][6], accessed_by=TupleGetItemGuardAccessor(6) + | | | | +- TENSOR_MATCH: check_tensor(L['args'][4][6], Tensor, DispatchKeySet(CUDA, BackendSelect, ADInplaceOrView, AutogradCUDA), torch.int32, device=0, requires_grad=False, size=[1, 1, 16], stride=[16, 16, 1]) + | | | | +- NO_HASATTR: hasattr(L['args'][4][6], '_dynamo_dynamic_indices') == False + | | | | +- NO_TENSOR_ALIASING + | | | +- GuardManager: source=L['args'][4][7], accessed_by=TupleGetItemGuardAccessor(7) + | | | | +- TENSOR_MATCH: check_tensor(L['args'][4][7], Tensor, DispatchKeySet(CUDA, BackendSelect, ADInplaceOrView, AutogradCUDA), torch.int32, device=0, requires_grad=False, size=[1, 1, 16, 16], stride=[256, 256, 16, 1]) + | | | | +- NO_HASATTR: hasattr(L['args'][4][7], '_dynamo_dynamic_indices') == False + | | | | +- NO_TENSOR_ALIASING + | | | +- GuardManager: source=L['args'][4][8], accessed_by=TupleGetItemGuardAccessor(8) + | | | | +- EQUALS_MATCH: L['args'][4][8] == 128 + | | | +- GuardManager: source=L['args'][4][9], accessed_by=TupleGetItemGuardAccessor(9) + | | | | +- EQUALS_MATCH: L['args'][4][9] == 128 + | | | +- GuardManager: source=L['args'][4][10], accessed_by=TupleGetItemGuardAccessor(10) + | | | | +- GuardManager: source=L['args'][4][10].__code__, accessed_by=GetAttrGuardAccessor(__code__) + | | | | | +- ID_MATCH: ___check_obj_id(L['args'][4][10].__code__, 140413271880128) + | | +- GuardManager: source=L['args'][5], accessed_by=TupleGetItemGuardAccessor(5) + | | | +- EQUALS_MATCH: L['args'][5] == 0.125 + | | +- GuardManager: source=L['args'][6], accessed_by=TupleGetItemGuardAccessor(6) + | | | +- DICT_LENGTH: len(L['args'][6]) == 3 + | | | +- GuardManager: source=L['args'][6]['ROWS_GUARANTEED_SAFE'], accessed_by=DictGetItemGuardAccessor(ROWS_GUARANTEED_SAFE) + | | | | +- ID_MATCH: ___check_obj_id(L['args'][6]['ROWS_GUARANTEED_SAFE'], 8910592) + | | | +- GuardManager: source=L['args'][6]['PRESCALE_QK'], accessed_by=DictGetItemGuardAccessor(PRESCALE_QK) + | | | | +- ID_MATCH: ___check_obj_id(L['args'][6]['PRESCALE_QK'], 8910592) + | | | +- GuardManager: source=L['args'][6]['OUTPUT_LOGSUMEXP'], accessed_by=DictGetItemGuardAccessor(OUTPUT_LOGSUMEXP) + | | | | +- ID_MATCH: ___check_obj_id(L['args'][6]['OUTPUT_LOGSUMEXP'], 8910592) + | +- GuardManager: source=L['kwargs'], accessed_by=DictGetItemGuardAccessor(kwargs) + | | +- DICT_LENGTH: not L['kwargs'] + | +- GuardManager: source=G, accessed_by=GlobalsGuardAccessor + | | +- GuardManager: source=G['flex_attention_hop'], accessed_by=DictGetItemGuardAccessor(flex_attention_hop) + | | | +- TYPE_MATCH: ___check_type_id(G['flex_attention_hop'], 96992544) + | | | +- GuardManager: source=G['flex_attention_hop'].__name__, accessed_by=GetAttrGuardAccessor(__name__) + | | | | +- EQUALS_MATCH: G['flex_attention_hop'].__name__ == 'flex_attention' + | | +- GuardManager: source=G['__builtins_dict___0'], accessed_by=DictGetItemGuardAccessor(__builtins_dict___0) + | | | +- GuardManager: source=G['__builtins_dict___0']['len'], accessed_by=DictGetItemGuardAccessor(len) + | | | | +- ID_MATCH: ___check_obj_id(G['__builtins_dict___0']['len'], 140413275558816) + | | | +- GuardManager: source=G['__builtins_dict___0']['sum'], accessed_by=DictGetItemGuardAccessor(sum) + | | | | +- ID_MATCH: ___check_obj_id(G['__builtins_dict___0']['sum'], 140413275559936) + | | | +- GuardManager: source=G['__builtins_dict___0']['list'], accessed_by=DictGetItemGuardAccessor(list) + | | | | +- ID_MATCH: ___check_obj_id(G['__builtins_dict___0']['list'], 8844320) + | | | +- GuardManager: source=G['__builtins_dict___0']['type'], accessed_by=DictGetItemGuardAccessor(type) + | | | | +- ID_MATCH: ___check_obj_id(G['__builtins_dict___0']['type'], 8813248) + | | | +- GuardManager: source=G['__builtins_dict___0']['tuple'], accessed_by=DictGetItemGuardAccessor(tuple) + | | | | +- ID_MATCH: ___check_obj_id(G['__builtins_dict___0']['tuple'], 8815232) + | | | +- GuardManager: source=G['__builtins_dict___0']['object'], accessed_by=DictGetItemGuardAccessor(object) + | | | | +- ID_MATCH: ___check_obj_id(G['__builtins_dict___0']['object'], 8813984) + | | | +- GuardManager: source=G['__builtins_dict___0']['isinstance'], accessed_by=DictGetItemGuardAccessor(isinstance) + | | | | +- ID_MATCH: ___check_obj_id(G['__builtins_dict___0']['isinstance'], 140413275558496) + | | +- GuardManager: source=G['__import_torch_dot_utils_dot__pytree'], accessed_by=DictGetItemGuardAccessor(__import_torch_dot_utils_dot__pytree) + | | | +- ID_MATCH: ___check_obj_id(G['__import_torch_dot_utils_dot__pytree'], 140411217627952) + | | | +- GuardManager: source=G['__import_torch_dot_utils_dot__pytree'].TreeSpec, accessed_by=GetAttrGuardAccessor(TreeSpec) + | | | | +- ID_MATCH: ___check_obj_id(G['__import_torch_dot_utils_dot__pytree'].TreeSpec, 84866496) + | | | +- GuardManager: source=G['__import_torch_dot_utils_dot__pytree']._is_leaf, accessed_by=GetAttrGuardAccessor(_is_leaf) + | | | | +- GuardManager: source=G['__import_torch_dot_utils_dot__pytree']._is_leaf.__code__, accessed_by=GetAttrGuardAccessor(__code__) + | | | | | +- ID_MATCH: ___check_obj_id(G['__import_torch_dot_utils_dot__pytree']._is_leaf.__code__, 140411217262720) + | | | +- GuardManager: source=G['__import_torch_dot_utils_dot__pytree']._LEAF_SPEC, accessed_by=GetAttrGuardAccessor(_LEAF_SPEC) + | | | | +- TYPE_MATCH: ___check_type_id(G['__import_torch_dot_utils_dot__pytree']._LEAF_SPEC, 85171104) + | | | | +- GuardManager: source=G['__import_torch_dot_utils_dot__pytree']._LEAF_SPEC.type, accessed_by=GetAttrGuardAccessor(type) + | | | | | +- ID_MATCH: ___check_obj_id(G['__import_torch_dot_utils_dot__pytree']._LEAF_SPEC.type, 8825760) + | | | | +- GuardManager: source=G['__import_torch_dot_utils_dot__pytree']._LEAF_SPEC.context, accessed_by=GetAttrGuardAccessor(context) + | | | | | +- ID_MATCH: ___check_obj_id(G['__import_torch_dot_utils_dot__pytree']._LEAF_SPEC.context, 8825760) + | | | | +- GuardManager: source=G['__import_torch_dot_utils_dot__pytree']._LEAF_SPEC.num_nodes, accessed_by=GetAttrGuardAccessor(num_nodes) + | | | | | +- EQUALS_MATCH: G['__import_torch_dot_utils_dot__pytree']._LEAF_SPEC.num_nodes == 1 + | | | | +- GuardManager: source=G['__import_torch_dot_utils_dot__pytree']._LEAF_SPEC.num_leaves, accessed_by=GetAttrGuardAccessor(num_leaves) + | | | | | +- EQUALS_MATCH: G['__import_torch_dot_utils_dot__pytree']._LEAF_SPEC.num_leaves == 1 + | | | | +- GuardManager: source=G['__import_torch_dot_utils_dot__pytree']._LEAF_SPEC.num_children, accessed_by=GetAttrGuardAccessor(num_children) + | | | | | +- EQUALS_MATCH: G['__import_torch_dot_utils_dot__pytree']._LEAF_SPEC.num_children == 0 + | | | | +- GuardManager: source=G['__import_torch_dot_utils_dot__pytree']._LEAF_SPEC.children_specs, accessed_by=GetAttrGuardAccessor(children_specs) + | | | | | +- TYPE_MATCH: ___check_type_id(G['__import_torch_dot_utils_dot__pytree']._LEAF_SPEC.children_specs, 8844320) + | | | | | +- LENGTH_CHECK: not G['__import_torch_dot_utils_dot__pytree']._LEAF_SPEC.children_specs + | | | +- GuardManager: source=G['__import_torch_dot_utils_dot__pytree']._get_node_type, accessed_by=GetAttrGuardAccessor(_get_node_type) + | | | | +- GuardManager: source=G['__import_torch_dot_utils_dot__pytree']._get_node_type.__code__, accessed_by=GetAttrGuardAccessor(__code__) + | | | | | +- ID_MATCH: ___check_obj_id(G['__import_torch_dot_utils_dot__pytree']._get_node_type.__code__, 140411217262448) + | | | +- DictGuardManager: source=G['__import_torch_dot_utils_dot__pytree'].SUPPORTED_NODES, accessed_by=GetAttrGuardAccessor(SUPPORTED_NODES) + | | | | +- DICT_VERSION: ___dict_version(G['__import_torch_dot_utils_dot__pytree'].SUPPORTED_NODES) == 519596 + | | | | +- KeyValueManager pair at index=1 + | | | | | +- ValueManager: GuardManager: source=G['__import_torch_dot_utils_dot__pytree'].SUPPORTED_NODES[list(G['__import_torch_dot_utils_dot__pytree'].SUPPORTED_NODES.keys())[1]] + | | | | | | +- GuardManager: source=G['__import_torch_dot_utils_dot__pytree'].SUPPORTED_NODES[list(G['__import_torch_dot_utils_dot__pytree'].SUPPORTED_NODES.keys())[1]].flatten_fn, accessed_by=GetAttrGuardAccessor(flatten_fn) + | | | | | | | +- GuardManager: source=G['__import_torch_dot_utils_dot__pytree'].SUPPORTED_NODES[list(G['__import_torch_dot_utils_dot__pytree'].SUPPORTED_NODES.keys())[1]].flatten_fn.__code__, accessed_by=GetAttrGuardAccessor(__code__) + | | | | | | | | +- ID_MATCH: ___check_obj_id(G['__import_torch_dot_utils_dot__pytree'].SUPPORTED_NODES[list(G['__import_torch_dot_utils_dot__pytree'].SUPPORTED_NODES.keys())[1]].flatten_fn.__code__, 140411196281984) + | | | | | | +- GuardManager: source=G['__import_torch_dot_utils_dot__pytree'].SUPPORTED_NODES[list(G['__import_torch_dot_utils_dot__pytree'].SUPPORTED_NODES.keys())[1]].unflatten_fn, accessed_by=GetAttrGuardAccessor(unflatten_fn) + | | | | | | | +- GuardManager: source=G['__import_torch_dot_utils_dot__pytree'].SUPPORTED_NODES[list(G['__import_torch_dot_utils_dot__pytree'].SUPPORTED_NODES.keys())[1]].unflatten_fn.__code__, accessed_by=GetAttrGuardAccessor(__code__) + | | | | | | | | +- ID_MATCH: ___check_obj_id(G['__import_torch_dot_utils_dot__pytree'].SUPPORTED_NODES[list(G['__import_torch_dot_utils_dot__pytree'].SUPPORTED_NODES.keys())[1]].unflatten_fn.__code__, 140411217182288) + | | | +- GuardManager: source=G['__import_torch_dot_utils_dot__pytree']._tree_flatten_helper, accessed_by=GetAttrGuardAccessor(_tree_flatten_helper) + | | | | +- GuardManager: source=G['__import_torch_dot_utils_dot__pytree']._tree_flatten_helper.__code__, accessed_by=GetAttrGuardAccessor(__code__) + | | | | | +- ID_MATCH: ___check_obj_id(G['__import_torch_dot_utils_dot__pytree']._tree_flatten_helper.__code__, 140411217413040) + | | | +- GuardManager: source=G['__import_torch_dot_utils_dot__pytree']._is_namedtuple_instance, accessed_by=GetAttrGuardAccessor(_is_namedtuple_instance) + | | | | +- GuardManager: source=G['__import_torch_dot_utils_dot__pytree']._is_namedtuple_instance.__code__, accessed_by=GetAttrGuardAccessor(__code__) + | | | | | +- ID_MATCH: ___check_obj_id(G['__import_torch_dot_utils_dot__pytree']._is_namedtuple_instance.__code__, 140411217412592) + +V1003 10:10:50.894000 2235078 torch/_dynamo/utils.py:988] {"chromium_event": {}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "20353735c197adbac78fed570e5a0d91"} + { + "name": "entire_frame_compile", + "ts": 1727975450894280.0, + "args": { + "cache_stats": { + "fxgraph_cache_hit": 0, + "fxgraph_cache_miss": 0, + "fxgraph_cache_bypass": 0 + } + }, + "ph": "E", + "cat": "dynamo_timed", + "tid": 0, + "pid": 0 + } +V1003 10:10:50.894000 2235078 torch/_dynamo/utils.py:988] {"chromium_event": {}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "6dc3af6842c03572276aaba0318e48d3"} + { + "name": "_compile.compile_inner", + "ts": 1727975450894591.0, + "args": { + "cache_stats": { + "fxgraph_cache_hit": 0, + "fxgraph_cache_miss": 0, + "fxgraph_cache_bypass": 0 + } + }, + "ph": "E", + "cat": "dynamo_timed", + "tid": 0, + "pid": 0 + } +V1003 10:10:50.895000 2235078 torch/_dynamo/utils.py:840] {"compilation_metrics": {"compile_id": "0/0", "frame_key": "1", "co_name": "_flex_attention_hop_wrapper", "co_filename": "/data/users/oulgen/pytorch/torch/nn/attention/flex_attention.py", "co_firstlineno": 1049, "cache_size": 0, "accumulated_cache_size": 0, "guard_count": 55, "shape_env_guard_count": 0, "graph_op_count": 12, "graph_node_count": 26, "graph_input_count": 11, "start_time": 1727975450.6325822, "entire_frame_compile_time_s": 0.26139068603515625, "backend_compile_time_s": 0.0006465911865234375, "inductor_compile_time_s": null, "code_gen_time_s": null, "fail_type": null, "fail_reason": null, "fail_user_frame_filename": null, "fail_user_frame_lineno": null, "non_compliant_ops": [], "compliant_custom_ops": [], "restart_reasons": [], "dynamo_time_before_restart_s": 0.0, "has_guarded_code": true, "possibly_missed_reinplacing_opportunities": 0, "remote_cache_time_saved_s": 0, "structured_logging_overhead_s": 0.019217516000000004, "config_suppress_errors": false, "config_inline_inbuilt_nn_modules": true, "specialize_float": true}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0} +V1003 10:10:53.499000 2235078 torch/_dynamo/convert_frame.py:915] {"dynamo_start": {"stack": [{"line": 916, "name": "", "filename": 1}, {"line": 14, "name": "run_tests", "filename": 2}, {"line": 38, "name": "run_tests", "filename": 3}, {"line": 1273, "name": "run_tests", "filename": 4}, {"line": 102, "name": "__init__", "filename": 5}, {"line": 274, "name": "runTests", "filename": 5}, {"line": 217, "name": "run", "filename": 6}, {"line": 84, "name": "__call__", "filename": 7}, {"line": 122, "name": "run", "filename": 7}, {"line": 84, "name": "__call__", "filename": 7}, {"line": 122, "name": "run", "filename": 7}, {"line": 678, "name": "__call__", "filename": 8}, {"line": 3116, "name": "run", "filename": 4}, {"line": 3088, "name": "_run_custom", "filename": 4}, {"line": 623, "name": "run", "filename": 8}, {"line": 579, "name": "_callTestMethod", "filename": 8}, {"line": 2983, "name": "wrapper", "filename": 4}, {"line": 81, "name": "inner", "filename": 9}, {"line": 81, "name": "inner", "filename": 9}, {"line": 395, "name": "test_flex_attention_caching", "filename": 1}, {"line": 379, "name": "fn", "filename": 1}]}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0} +V1003 10:10:53.499000 2235078 torch/_dynamo/utils.py:988] {"chromium_event": {}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0, "has_payload": "e10001bc2249e6200cda3bf42b80be57"} + { + "name": "_compile.compile_inner", + "ts": 1727975453499593.8, + "args": null, + "ph": "B", + "cat": "dynamo_timed", + "tid": 0, + "pid": 0 + } +V1003 10:10:53.499000 2235078 torch/_dynamo/utils.py:988] {"chromium_event": {}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0, "has_payload": "ff0561bc609e54d7e5775bdf8f81ff48"} + { + "name": "entire_frame_compile", + "ts": 1727975453499593.8, + "args": null, + "ph": "B", + "cat": "dynamo_timed", + "tid": 0, + "pid": 0 + } +V1003 10:10:53.502000 2235078 torch/_subclasses/meta_utils.py:204] {"describe_storage": {"id": 0, "describer_id": 9, "size": 524288}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0} +V1003 10:10:53.503000 2235078 torch/_subclasses/meta_utils.py:417] {"describe_tensor": {"id": 0, "ndim": 4, "dtype": "torch.float32", "device": "device(type='cuda', index=0)", "size": [1, 4, 512, 64], "is_leaf": true, "stride": [131072, 32768, 64, 1], "storage": 0, "view_func": "", "describer_id": 9}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0} +V1003 10:10:53.503000 2235078 torch/_subclasses/meta_utils.py:1640] {"describe_source": {"describer_id": 9, "id": 0, "source": "L['q']"}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0} +V1003 10:10:53.511000 2235078 torch/_subclasses/meta_utils.py:204] {"describe_storage": {"id": 1, "describer_id": 9, "size": 524288}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0} +V1003 10:10:53.512000 2235078 torch/_subclasses/meta_utils.py:417] {"describe_tensor": {"id": 1, "ndim": 4, "dtype": "torch.float32", "device": "device(type='cuda', index=0)", "size": [1, 4, 512, 64], "is_leaf": true, "stride": [131072, 32768, 64, 1], "storage": 1, "view_func": "", "describer_id": 9}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0} +V1003 10:10:53.512000 2235078 torch/_subclasses/meta_utils.py:1640] {"describe_source": {"describer_id": 9, "id": 1, "source": "L['k']"}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0} +V1003 10:10:53.513000 2235078 torch/_subclasses/meta_utils.py:204] {"describe_storage": {"id": 2, "describer_id": 9, "size": 524288}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0} +V1003 10:10:53.514000 2235078 torch/_subclasses/meta_utils.py:417] {"describe_tensor": {"id": 2, "ndim": 4, "dtype": "torch.float32", "device": "device(type='cuda', index=0)", "size": [1, 4, 512, 64], "is_leaf": true, "stride": [131072, 32768, 64, 1], "storage": 2, "view_func": "", "describer_id": 9}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0} +V1003 10:10:53.514000 2235078 torch/_subclasses/meta_utils.py:1640] {"describe_source": {"describer_id": 9, "id": 2, "source": "L['v']"}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0} +V1003 10:10:53.532000 2235078 torch/_subclasses/meta_utils.py:204] {"describe_storage": {"id": 3, "describer_id": 9, "size": 64}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0} +V1003 10:10:53.532000 2235078 torch/_subclasses/meta_utils.py:417] {"describe_tensor": {"id": 3, "ndim": 3, "dtype": "torch.int32", "device": "device(type='cuda', index=0)", "size": [1, 1, 16], "is_leaf": true, "stride": [16, 16, 1], "storage": 3, "view_func": "", "describer_id": 9}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0} +V1003 10:10:53.532000 2235078 torch/_subclasses/meta_utils.py:1640] {"describe_source": {"describer_id": 9, "id": 3, "source": "L['block_mask'].kv_num_blocks"}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0} +V1003 10:10:53.676000 2235078 torch/_subclasses/meta_utils.py:204] {"describe_storage": {"id": 4, "describer_id": 9, "size": 1024}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0} +V1003 10:10:53.677000 2235078 torch/_subclasses/meta_utils.py:417] {"describe_tensor": {"id": 7, "ndim": 4, "dtype": "torch.int32", "device": "device(type='cuda', index=0)", "size": [1, 1, 16, 16], "is_leaf": true, "stride": [256, 256, 16, 1], "storage": 4, "view_func": "", "describer_id": 9}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0} +V1003 10:10:53.677000 2235078 torch/_subclasses/meta_utils.py:1640] {"describe_source": {"describer_id": 9, "id": 7, "source": "L['block_mask'].kv_indices"}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0} +V1003 10:10:53.679000 2235078 torch/_subclasses/meta_utils.py:204] {"describe_storage": {"id": 5, "describer_id": 9, "size": 64}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0} +V1003 10:10:53.679000 2235078 torch/_subclasses/meta_utils.py:417] {"describe_tensor": {"id": 8, "ndim": 3, "dtype": "torch.int32", "device": "device(type='cuda', index=0)", "size": [1, 1, 16], "is_leaf": true, "stride": [16, 16, 1], "storage": 5, "view_func": "", "describer_id": 9}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0} +V1003 10:10:53.679000 2235078 torch/_subclasses/meta_utils.py:1640] {"describe_source": {"describer_id": 9, "id": 8, "source": "L['block_mask'].full_kv_num_blocks"}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0} +V1003 10:10:53.680000 2235078 torch/_subclasses/meta_utils.py:204] {"describe_storage": {"id": 6, "describer_id": 9, "size": 1024}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0} +V1003 10:10:53.681000 2235078 torch/_subclasses/meta_utils.py:417] {"describe_tensor": {"id": 9, "ndim": 4, "dtype": "torch.int32", "device": "device(type='cuda', index=0)", "size": [1, 1, 16, 16], "is_leaf": true, "stride": [256, 256, 16, 1], "storage": 6, "view_func": "", "describer_id": 9}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0} +V1003 10:10:53.681000 2235078 torch/_subclasses/meta_utils.py:1640] {"describe_source": {"describer_id": 9, "id": 9, "source": "L['block_mask'].full_kv_indices"}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0} +V1003 10:10:53.682000 2235078 torch/_subclasses/meta_utils.py:204] {"describe_storage": {"id": 7, "describer_id": 9, "size": 64}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0} +V1003 10:10:53.683000 2235078 torch/_subclasses/meta_utils.py:417] {"describe_tensor": {"id": 10, "ndim": 3, "dtype": "torch.int32", "device": "device(type='cuda', index=0)", "size": [1, 1, 16], "is_leaf": true, "stride": [16, 16, 1], "storage": 7, "view_func": "", "describer_id": 9}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0} +V1003 10:10:53.683000 2235078 torch/_subclasses/meta_utils.py:1640] {"describe_source": {"describer_id": 9, "id": 10, "source": "L['block_mask'].q_num_blocks"}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0} +V1003 10:10:53.684000 2235078 torch/_subclasses/meta_utils.py:204] {"describe_storage": {"id": 8, "describer_id": 9, "size": 1024}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0} +V1003 10:10:53.685000 2235078 torch/_subclasses/meta_utils.py:417] {"describe_tensor": {"id": 11, "ndim": 4, "dtype": "torch.int32", "device": "device(type='cuda', index=0)", "size": [1, 1, 16, 16], "is_leaf": true, "stride": [256, 256, 16, 1], "storage": 8, "view_func": "", "describer_id": 9}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0} +V1003 10:10:53.685000 2235078 torch/_subclasses/meta_utils.py:1640] {"describe_source": {"describer_id": 9, "id": 11, "source": "L['block_mask'].q_indices"}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0} +V1003 10:10:53.686000 2235078 torch/_subclasses/meta_utils.py:204] {"describe_storage": {"id": 9, "describer_id": 9, "size": 64}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0} +V1003 10:10:53.686000 2235078 torch/_subclasses/meta_utils.py:417] {"describe_tensor": {"id": 12, "ndim": 3, "dtype": "torch.int32", "device": "device(type='cuda', index=0)", "size": [1, 1, 16], "is_leaf": true, "stride": [16, 16, 1], "storage": 9, "view_func": "", "describer_id": 9}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0} +V1003 10:10:53.687000 2235078 torch/_subclasses/meta_utils.py:1640] {"describe_source": {"describer_id": 9, "id": 12, "source": "L['block_mask'].full_q_num_blocks"}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0} +V1003 10:10:53.688000 2235078 torch/_subclasses/meta_utils.py:204] {"describe_storage": {"id": 10, "describer_id": 9, "size": 1024}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0} +V1003 10:10:53.688000 2235078 torch/_subclasses/meta_utils.py:417] {"describe_tensor": {"id": 13, "ndim": 4, "dtype": "torch.int32", "device": "device(type='cuda', index=0)", "size": [1, 1, 16, 16], "is_leaf": true, "stride": [256, 256, 16, 1], "storage": 10, "view_func": "", "describer_id": 9}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0} +V1003 10:10:53.689000 2235078 torch/_subclasses/meta_utils.py:1640] {"describe_source": {"describer_id": 9, "id": 13, "source": "L['block_mask'].full_q_indices"}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0} +V1003 10:10:53.697000 2235078 torch/_dynamo/output_graph.py:1347] {"dynamo_output_graph": {"sizes": {"l_q_": [1, 4, 512, 64], "l_k_": [1, 4, 512, 64], "l_v_": [1, 4, 512, 64], "l_block_mask_kv_num_blocks": [1, 1, 16], "l_block_mask_kv_indices": [1, 1, 16, 16], "l_block_mask_full_kv_num_blocks": [1, 1, 16], "l_block_mask_full_kv_indices": [1, 1, 16, 16], "l_block_mask_q_num_blocks": [1, 1, 16], "l_block_mask_q_indices": [1, 1, 16, 16], "l_block_mask_full_q_num_blocks": [1, 1, 16], "l_block_mask_full_q_indices": [1, 1, 16, 16], "child_1": [], "child_2": [], "child_3": [], "child_4": [], "child": [], "child_5": [], "child_6": [], "child_7": [], "child_8": [], "out": [1, 4, 512, 64]}}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0, "has_payload": "0365bd76b3474fb6d96e4c3c42585fb7"} + class GraphModule(torch.nn.Module): + def forward(self, L_q_: "f32[1, 4, 512, 64][131072, 32768, 64, 1]cuda:0", L_k_: "f32[1, 4, 512, 64][131072, 32768, 64, 1]cuda:0", L_v_: "f32[1, 4, 512, 64][131072, 32768, 64, 1]cuda:0", L_block_mask_kv_num_blocks: "i32[1, 1, 16][16, 16, 1]cuda:0", L_block_mask_kv_indices: "i32[1, 1, 16, 16][256, 256, 16, 1]cuda:0", L_block_mask_full_kv_num_blocks: "i32[1, 1, 16][16, 16, 1]cuda:0", L_block_mask_full_kv_indices: "i32[1, 1, 16, 16][256, 256, 16, 1]cuda:0", L_block_mask_q_num_blocks: "i32[1, 1, 16][16, 16, 1]cuda:0", L_block_mask_q_indices: "i32[1, 1, 16, 16][256, 256, 16, 1]cuda:0", L_block_mask_full_q_num_blocks: "i32[1, 1, 16][16, 16, 1]cuda:0", L_block_mask_full_q_indices: "i32[1, 1, 16, 16][256, 256, 16, 1]cuda:0"): + l_q_ = L_q_ + l_k_ = L_k_ + l_v_ = L_v_ + l_block_mask_kv_num_blocks = L_block_mask_kv_num_blocks + l_block_mask_kv_indices = L_block_mask_kv_indices + l_block_mask_full_kv_num_blocks = L_block_mask_full_kv_num_blocks + l_block_mask_full_kv_indices = L_block_mask_full_kv_indices + l_block_mask_q_num_blocks = L_block_mask_q_num_blocks + l_block_mask_q_indices = L_block_mask_q_indices + l_block_mask_full_q_num_blocks = L_block_mask_full_q_num_blocks + l_block_mask_full_q_indices = L_block_mask_full_q_indices + + # File: /data/users/oulgen/pytorch/torch/nn/attention/flex_attention.py:1032 in flex_attention, code: out, lse = flex_attention_hop( + child_1: "i32[][]cuda:0" = l_q_.new_empty([], dtype = torch.int32); child_1 = None + child_2: "i32[][]cuda:0" = l_q_.new_empty([], dtype = torch.int32); child_2 = None + child_3: "i32[][]cuda:0" = l_q_.new_empty([], dtype = torch.int32); child_3 = None + child_4: "i32[][]cuda:0" = l_q_.new_empty([], dtype = torch.int32); child_4 = None + child: "f32[][]cuda:0" = l_q_.new_empty([], requires_grad = False); child = None + score_mod_0 = self.score_mod_0 + child_5: "i32[][]cuda:0" = l_q_.new_empty([], dtype = torch.int32); child_5 = None + child_6: "i32[][]cuda:0" = l_q_.new_empty([], dtype = torch.int32); child_6 = None + child_7: "i32[][]cuda:0" = l_q_.new_empty([], dtype = torch.int32); child_7 = None + child_8: "i32[][]cuda:0" = l_q_.new_empty([], dtype = torch.int32); child_8 = None + mask_fn_0 = self.mask_fn_0 + flex_attention = torch.ops.higher_order.flex_attention(l_q_, l_k_, l_v_, score_mod_0, (l_block_mask_kv_num_blocks, l_block_mask_kv_indices, l_block_mask_full_kv_num_blocks, l_block_mask_full_kv_indices, l_block_mask_q_num_blocks, l_block_mask_q_indices, l_block_mask_full_q_num_blocks, l_block_mask_full_q_indices, 128, 128, mask_fn_0), 0.125, {'ROWS_GUARANTEED_SAFE': False, 'PRESCALE_QK': False, 'OUTPUT_LOGSUMEXP': False}, (), ()); l_q_ = l_k_ = l_v_ = score_mod_0 = l_block_mask_kv_num_blocks = l_block_mask_kv_indices = l_block_mask_full_kv_num_blocks = l_block_mask_full_kv_indices = l_block_mask_q_num_blocks = l_block_mask_q_indices = l_block_mask_full_q_num_blocks = l_block_mask_full_q_indices = mask_fn_0 = None + out: "f32[1, 4, 512, 64][131072, 32768, 64, 1]cuda:0" = flex_attention[0]; flex_attention = None + return (out,) + + class score_mod_0(torch.nn.Module): + def forward(self, child: "f32[][]cuda:0", child_1: "i32[][]cuda:0", child_2: "i32[][]cuda:0", child_3: "i32[][]cuda:0", child_4: "i32[][]cuda:0"): + # File: /data/users/oulgen/pytorch/test/inductor/test_codecache.py:377 in score_mod, code: return score + (q - kv) + sub: "i32[][]cuda:0" = child_3 - child_4; child_3 = child_4 = None + add: "f32[][]cuda:0" = child + sub; child = sub = None + return add + + class mask_fn_0(torch.nn.Module): + def forward(self, child_5: "i32[][]cuda:0", child_6: "i32[][]cuda:0", child_7: "i32[][]cuda:0", child_8: "i32[][]cuda:0"): + # File: /data/users/oulgen/pytorch/test/inductor/test_codecache.py:373 in , code: lambda b, h, q, kv: q >= kv, None, None, 2048, 2048 + ge: "b8[][]cuda:0" = child_7 >= child_8; child_7 = child_8 = None + return ge + +V1003 10:10:53.698000 2235078 torch/_dynamo/utils.py:988] {"chromium_event": {}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0, "has_payload": "f3daea7b3445ed3e94cb46786b2c5d08"} + { + "name": "OutputGraph.call_user_compiler", + "ts": 1727975453698084.0, + "args": null, + "ph": "B", + "cat": "dynamo_timed", + "tid": 0, + "pid": 0 + } +V1003 10:10:53.698000 2235078 torch/_dynamo/utils.py:988] {"chromium_event": {}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0, "has_payload": "d8ed1bc2cb1632abc21c22f112ed0fec"} + { + "name": "backend_compile", + "ts": 1727975453698084.0, + "args": null, + "ph": "B", + "cat": "dynamo_timed", + "tid": 0, + "pid": 0 + } +V1003 10:10:53.708000 2235078 torch/_dynamo/utils.py:988] {"chromium_event": {}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0, "has_payload": "18faf90ed7d7bb7f081083da150205b3"} + { + "name": "create_aot_dispatcher_function", + "ts": 1727975453708777.0, + "args": null, + "ph": "B", + "cat": "dynamo_timed", + "tid": 0, + "pid": 0 + } +V1003 10:10:53.842000 2235078 torch/_functorch/_aot_autograd/dispatch_and_compile_graph.py:215] {"aot_forward_graph": {}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0, "has_payload": "f6cec159cb1cee52405afcfe40b62c03"} + class (torch.nn.Module): + def forward(self, arg0_1: "f32[1, 4, 512, 64][131072, 32768, 64, 1]cuda:0", arg1_1: "f32[1, 4, 512, 64][131072, 32768, 64, 1]cuda:0", arg2_1: "f32[1, 4, 512, 64][131072, 32768, 64, 1]cuda:0", arg3_1: "i32[1, 1, 16][16, 16, 1]cuda:0", arg4_1: "i32[1, 1, 16, 16][256, 256, 16, 1]cuda:0", arg5_1: "i32[1, 1, 16][16, 16, 1]cuda:0", arg6_1: "i32[1, 1, 16, 16][256, 256, 16, 1]cuda:0", arg7_1: "i32[1, 1, 16][16, 16, 1]cuda:0", arg8_1: "i32[1, 1, 16, 16][256, 256, 16, 1]cuda:0", arg9_1: "i32[1, 1, 16][16, 16, 1]cuda:0", arg10_1: "i32[1, 1, 16, 16][256, 256, 16, 1]cuda:0"): + # File: /data/users/oulgen/pytorch/torch/nn/attention/flex_attention.py:1032 in flex_attention, code: out, lse = flex_attention_hop( + sdpa_score0 = self.sdpa_score0 + sdpa_mask0 = self.sdpa_mask0 + flex_attention = torch.ops.higher_order.flex_attention(arg0_1, arg1_1, arg2_1, sdpa_score0, (arg3_1, arg4_1, arg5_1, arg6_1, arg7_1, arg8_1, arg9_1, arg10_1, 128, 128, sdpa_mask0), 0.125, {'ROWS_GUARANTEED_SAFE': False, 'PRESCALE_QK': False, 'OUTPUT_LOGSUMEXP': False}, (), ()); arg0_1 = arg1_1 = arg2_1 = sdpa_score0 = arg3_1 = arg4_1 = arg5_1 = arg6_1 = arg7_1 = arg8_1 = arg9_1 = arg10_1 = sdpa_mask0 = None + getitem: "f32[1, 4, 512, 64][131072, 32768, 64, 1]cuda:0" = flex_attention[0]; flex_attention = None + return (getitem,) + + class sdpa_score0(torch.nn.Module): + def forward(self, arg0_1: "f32[][]cuda:0", arg1_1: "i32[][]cuda:0", arg2_1: "i32[][]cuda:0", arg3_1: "i32[][]cuda:0", arg4_1: "i32[][]cuda:0"): + # File: /data/users/oulgen/pytorch/torch/nn/attention/flex_attention.py:1032 in flex_attention, code: out, lse = flex_attention_hop( + sub: "i32[][]cuda:0" = torch.ops.aten.sub.Tensor(arg3_1, arg4_1); arg3_1 = arg4_1 = None + add: "f32[][]cuda:0" = torch.ops.aten.add.Tensor(arg0_1, sub); arg0_1 = sub = None + return add + + class sdpa_mask0(torch.nn.Module): + def forward(self, arg0_1: "i32[][]cuda:0", arg1_1: "i32[][]cuda:0", arg2_1: "i32[][]cuda:0", arg3_1: "i32[][]cuda:0"): + # File: /data/users/oulgen/pytorch/test/inductor/test_codecache.py:373 in , code: lambda b, h, q, kv: q >= kv, None, None, 2048, 2048 + ge: "b8[][]cuda:0" = torch.ops.aten.ge.Tensor(arg2_1, arg3_1); arg2_1 = arg3_1 = None + return ge + +V1003 10:10:53.843000 2235078 torch/_dynamo/utils.py:988] {"chromium_event": {}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0, "has_payload": "5dd81e8de7669fa4f8da5f234591e1dc"} + { + "name": "compile_fx..fw_compiler_base", + "ts": 1727975453842696.2, + "args": null, + "ph": "B", + "cat": "dynamo_timed", + "tid": 0, + "pid": 0 + } +V1003 10:10:54.072000 2235078 torch/_dynamo/utils.py:988] {"chromium_event": {}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0, "has_payload": "df64f9080a1fb9082793b2c995d4ba13"} + { + "name": "compile_fx_inner", + "ts": 1727975454072213.5, + "args": null, + "ph": "B", + "cat": "dynamo_timed", + "tid": 0, + "pid": 0 + } +V1003 10:10:54.072000 2235078 torch/_dynamo/utils.py:988] {"chromium_event": {}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0, "has_payload": "6fe6927d5157dc6d0578283e86ead8a3"} + { + "name": "inductor_compile", + "ts": 1727975454072213.5, + "args": null, + "ph": "B", + "cat": "dynamo_timed", + "tid": 0, + "pid": 0 + } +V1003 10:10:56.234000 2235078 torch/_inductor/compile_fx.py:731] {"artifact": {"name": "fx_graph_runnable", "encoding": "string"}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0, "has_payload": "c507028eda974d39e4ca6593c6320460"} + + import torch + from torch import tensor, device + import torch.fx as fx + from torch._dynamo.testing import rand_strided + from math import inf + import torch._inductor.inductor_prims + + import torch._dynamo.config + import torch._inductor.config + import torch._functorch.config + import torch.fx.experimental._config + torch._dynamo.config.log_compilation_metrics = False + torch._dynamo.config.fake_tensor_cache_crosscheck_enabled = True + torch._inductor.config.fx_graph_remote_cache = False + torch._inductor.config.autotune_local_cache = False + torch._inductor.config.autotune_remote_cache = False + torch._functorch.config.unlift_effect_tokens = True + + + + isolate_fails_code_str = None + + + + # torch version: 2.5.0a0+git7647c39 + # torch cuda version: 12.0 + # torch git version: 7647c398ff87daf70260854cf0a7f7993b3abc76 + + + # CUDA Info: + # nvcc: NVIDIA (R) Cuda compiler driver + # Copyright (c) 2005-2023 NVIDIA Corporation + # Built on Fri_Jan__6_16:45:21_PST_2023 + # Cuda compilation tools, release 12.0, V12.0.140 + # Build cuda_12.0.r12.0/compiler.32267302_0 + + # GPU Hardware Info: + # NVIDIA PG509-210 : 8 + + + from torch.nn import * + class Repro(torch.nn.Module): + def __init__(self) -> None: + super().__init__() + self.sdpa_score0 = () + self.sdpa_mask0 = () + + + + def forward(self, arg0_1, arg1_1, arg2_1, arg3_1, arg4_1, arg5_1, arg6_1, arg7_1, arg8_1, arg9_1, arg10_1): + sdpa_score0 = self.sdpa_score0 + sdpa_mask0 = self.sdpa_mask0 + flex_attention = torch.ops.higher_order.flex_attention(arg0_1, arg1_1, arg2_1, sdpa_score0, (arg3_1, arg4_1, arg5_1, arg6_1, arg7_1, arg8_1, arg9_1, arg10_1, 128, 128, sdpa_mask0), 0.125, {'ROWS_GUARANTEED_SAFE': False, 'PRESCALE_QK': False, 'OUTPUT_LOGSUMEXP': False}, (), ()); arg0_1 = arg1_1 = arg2_1 = sdpa_score0 = arg3_1 = arg4_1 = arg5_1 = arg6_1 = arg7_1 = arg8_1 = arg9_1 = arg10_1 = sdpa_mask0 = None + getitem = flex_attention[0]; flex_attention = None + return (getitem,) + + def load_args(reader): + buf0 = reader.storage(None, 524288, device=device(type='cuda', index=0)) + reader.tensor(buf0, (1, 4, 512, 64), is_leaf=True) # arg0_1 + buf1 = reader.storage(None, 524288, device=device(type='cuda', index=0)) + reader.tensor(buf1, (1, 4, 512, 64), is_leaf=True) # arg1_1 + buf2 = reader.storage(None, 524288, device=device(type='cuda', index=0)) + reader.tensor(buf2, (1, 4, 512, 64), is_leaf=True) # arg2_1 + buf3 = reader.storage(None, 64, device=device(type='cuda', index=0), dtype_hint=torch.int32) + reader.tensor(buf3, (1, 1, 16), dtype=torch.int32, is_leaf=True) # arg3_1 + buf4 = reader.storage(None, 1024, device=device(type='cuda', index=0), dtype_hint=torch.int32) + reader.tensor(buf4, (1, 1, 16, 16), dtype=torch.int32, is_leaf=True) # arg4_1 + buf5 = reader.storage(None, 64, device=device(type='cuda', index=0), dtype_hint=torch.int32) + reader.tensor(buf5, (1, 1, 16), dtype=torch.int32, is_leaf=True) # arg5_1 + buf6 = reader.storage(None, 1024, device=device(type='cuda', index=0), dtype_hint=torch.int32) + reader.tensor(buf6, (1, 1, 16, 16), dtype=torch.int32, is_leaf=True) # arg6_1 + buf7 = reader.storage(None, 64, device=device(type='cuda', index=0), dtype_hint=torch.int32) + reader.tensor(buf7, (1, 1, 16), dtype=torch.int32, is_leaf=True) # arg7_1 + buf8 = reader.storage(None, 1024, device=device(type='cuda', index=0), dtype_hint=torch.int32) + reader.tensor(buf8, (1, 1, 16, 16), dtype=torch.int32, is_leaf=True) # arg8_1 + buf9 = reader.storage(None, 64, device=device(type='cuda', index=0), dtype_hint=torch.int32) + reader.tensor(buf9, (1, 1, 16), dtype=torch.int32, is_leaf=True) # arg9_1 + buf10 = reader.storage(None, 1024, device=device(type='cuda', index=0), dtype_hint=torch.int32) + reader.tensor(buf10, (1, 1, 16, 16), dtype=torch.int32, is_leaf=True) # arg10_1 + load_args._version = 0 + mod = Repro() + if __name__ == '__main__': + from torch._dynamo.repro.after_aot import run_repro + with torch.no_grad(): + run_repro(mod, load_args, accuracy=False, command='run', save_dir=None, tracing_mode='real', check_str=None) + # To run it separately, do + # mod, args = run_repro(mod, load_args, accuracy=False, command='get_args', save_dir=None, tracing_mode='real', check_str=None) + # mod(*args) +V1003 10:10:56.463000 2235078 torch/_inductor/compile_fx.py:795] {"inductor_post_grad_graph": {}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0, "has_payload": "f6cec159cb1cee52405afcfe40b62c03"} + class (torch.nn.Module): + def forward(self, arg0_1: "f32[1, 4, 512, 64][131072, 32768, 64, 1]cuda:0", arg1_1: "f32[1, 4, 512, 64][131072, 32768, 64, 1]cuda:0", arg2_1: "f32[1, 4, 512, 64][131072, 32768, 64, 1]cuda:0", arg3_1: "i32[1, 1, 16][16, 16, 1]cuda:0", arg4_1: "i32[1, 1, 16, 16][256, 256, 16, 1]cuda:0", arg5_1: "i32[1, 1, 16][16, 16, 1]cuda:0", arg6_1: "i32[1, 1, 16, 16][256, 256, 16, 1]cuda:0", arg7_1: "i32[1, 1, 16][16, 16, 1]cuda:0", arg8_1: "i32[1, 1, 16, 16][256, 256, 16, 1]cuda:0", arg9_1: "i32[1, 1, 16][16, 16, 1]cuda:0", arg10_1: "i32[1, 1, 16, 16][256, 256, 16, 1]cuda:0"): + # File: /data/users/oulgen/pytorch/torch/nn/attention/flex_attention.py:1032 in flex_attention, code: out, lse = flex_attention_hop( + sdpa_score0 = self.sdpa_score0 + sdpa_mask0 = self.sdpa_mask0 + flex_attention = torch.ops.higher_order.flex_attention(arg0_1, arg1_1, arg2_1, sdpa_score0, (arg3_1, arg4_1, arg5_1, arg6_1, arg7_1, arg8_1, arg9_1, arg10_1, 128, 128, sdpa_mask0), 0.125, {'ROWS_GUARANTEED_SAFE': False, 'PRESCALE_QK': False, 'OUTPUT_LOGSUMEXP': False}, (), ()); arg0_1 = arg1_1 = arg2_1 = sdpa_score0 = arg3_1 = arg4_1 = arg5_1 = arg6_1 = arg7_1 = arg8_1 = arg9_1 = arg10_1 = sdpa_mask0 = None + getitem: "f32[1, 4, 512, 64][131072, 32768, 64, 1]cuda:0" = flex_attention[0]; flex_attention = None + return (getitem,) + + class sdpa_score0(torch.nn.Module): + def forward(self, arg0_1: "f32[][]cuda:0", arg1_1: "i32[][]cuda:0", arg2_1: "i32[][]cuda:0", arg3_1: "i32[][]cuda:0", arg4_1: "i32[][]cuda:0"): + # File: /data/users/oulgen/pytorch/torch/nn/attention/flex_attention.py:1032 in flex_attention, code: out, lse = flex_attention_hop( + sub: "i32[][]cuda:0" = torch.ops.aten.sub.Tensor(arg3_1, arg4_1); arg3_1 = arg4_1 = None + add: "f32[][]cuda:0" = torch.ops.aten.add.Tensor(arg0_1, sub); arg0_1 = sub = None + return add + + class sdpa_mask0(torch.nn.Module): + def forward(self, arg0_1: "i32[][]cuda:0", arg1_1: "i32[][]cuda:0", arg2_1: "i32[][]cuda:0", arg3_1: "i32[][]cuda:0"): + # File: /data/users/oulgen/pytorch/test/inductor/test_codecache.py:373 in , code: lambda b, h, q, kv: q >= kv, None, None, 2048, 2048 + ge: "b8[][]cuda:0" = torch.ops.aten.ge.Tensor(arg2_1, arg3_1); arg2_1 = arg3_1 = None + return ge + +V1003 10:10:56.496000 2235078 torch/_dynamo/utils.py:988] {"chromium_event": {}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0, "has_payload": "fc5eec6dd74463eae278f61fbe01c218"} + { + "name": "GraphLowering.run", + "ts": 1727975456496145.2, + "args": null, + "ph": "B", + "cat": "dynamo_timed", + "tid": 0, + "pid": 0 + } +V1003 10:10:56.814000 2235078 torch/_dynamo/utils.py:988] {"chromium_event": {}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0, "has_payload": "ceafe5eb3d5703b5ca9729fc880cd47b"} + { + "name": "GraphLowering.run", + "ts": 1727975456813991.2, + "args": { + "cache_stats": { + "fxgraph_cache_hit": 0, + "fxgraph_cache_miss": 1, + "fxgraph_cache_bypass": 0 + } + }, + "ph": "E", + "cat": "dynamo_timed", + "tid": 0, + "pid": 0 + } +V1003 10:10:56.814000 2235078 torch/_dynamo/utils.py:988] {"chromium_event": {}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0, "has_payload": "5103da0c4e3f17344951cb8876a0ef08"} + { + "name": "GraphLowering.compile_to_module", + "ts": 1727975456814764.0, + "args": null, + "ph": "B", + "cat": "dynamo_timed", + "tid": 0, + "pid": 0 + } +V1003 10:10:56.815000 2235078 torch/_dynamo/utils.py:988] {"chromium_event": {}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0, "has_payload": "0cc1abc97b9fd26bc3c8bcfa0fb94399"} + { + "name": "code_gen", + "ts": 1727975456814764.0, + "args": null, + "ph": "B", + "cat": "dynamo_timed", + "tid": 0, + "pid": 0 + } +V1003 10:10:56.820000 2235078 torch/_dynamo/utils.py:988] {"chromium_event": {}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0, "has_payload": "b680f8a68ec3cd182eef8fd76116df1f"} + { + "name": "Scheduler.__init__", + "ts": 1727975456820242.5, + "args": null, + "ph": "B", + "cat": "dynamo_timed", + "tid": 0, + "pid": 0 + } +V1003 10:10:56.826000 2235078 torch/_dynamo/utils.py:988] {"chromium_event": {}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0, "has_payload": "3ea7d3d1a284720131038fa646c6b219"} + { + "name": "Scheduler.__init__", + "ts": 1727975456826429.2, + "args": { + "cache_stats": { + "fxgraph_cache_hit": 0, + "fxgraph_cache_miss": 1, + "fxgraph_cache_bypass": 0 + } + }, + "ph": "E", + "cat": "dynamo_timed", + "tid": 0, + "pid": 0 + } +V1003 10:10:56.826000 2235078 torch/_dynamo/utils.py:988] {"chromium_event": {}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0, "has_payload": "6328137516750744d43158caca6502ff"} + { + "name": "Scheduler.codegen", + "ts": 1727975456826813.2, + "args": null, + "ph": "B", + "cat": "dynamo_timed", + "tid": 0, + "pid": 0 + } +V1003 10:10:56.837000 2235078 torch/_dynamo/utils.py:988] {"chromium_event": {}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0, "has_payload": "ec3d70ea4b3cc1330ead5e46c08a16ac"} + { + "name": "Scheduler.codegen", + "ts": 1727975456837107.8, + "args": { + "cache_stats": { + "fxgraph_cache_hit": 0, + "fxgraph_cache_miss": 1, + "fxgraph_cache_bypass": 0 + } + }, + "ph": "E", + "cat": "dynamo_timed", + "tid": 0, + "pid": 0 + } +V1003 10:10:56.837000 2235078 torch/_dynamo/utils.py:988] {"chromium_event": {}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0, "has_payload": "bdcac968c06e063fd73d86a38c67b967"} + { + "name": "PythonWrapperCodegen.generate", + "ts": 1727975456837466.0, + "args": null, + "ph": "B", + "cat": "dynamo_timed", + "tid": 0, + "pid": 0 + } +V1003 10:10:56.840000 2235078 torch/_dynamo/utils.py:988] {"chromium_event": {}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0, "has_payload": "4f0fc9e676bebc5b3ec366bb1e137c82"} + { + "name": "PythonWrapperCodegen.generate", + "ts": 1727975456840311.0, + "args": { + "cache_stats": { + "fxgraph_cache_hit": 0, + "fxgraph_cache_miss": 1, + "fxgraph_cache_bypass": 0 + } + }, + "ph": "E", + "cat": "dynamo_timed", + "tid": 0, + "pid": 0 + } +V1003 10:10:56.841000 2235078 torch/_inductor/graph.py:1899] {"inductor_output_code": {"filename": "/tmp/oulgen/tmp4z1i5ywe/kp/ckpysuucou6gm55terbvpynnfevubjpwkfm3ubzxyauw2bgggpvi.py"}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0, "has_payload": "187381168c614ba1181a424c67fdb77c"} + # AOT ID: ['0_inference'] + from ctypes import c_void_p, c_long, c_int + import torch + import math + import random + import os + import tempfile + from math import inf, nan + from torch._inductor.hooks import run_intermediate_hooks + from torch._inductor.utils import maybe_profile + from torch._inductor.codegen.memory_planning import _align as align + from torch import device, empty_strided + from torch._inductor.async_compile import AsyncCompile + from torch._inductor.select_algorithm import extern_kernels + from torch._inductor.codegen.multi_kernel import MultiKernelCall + import torch._inductor.kernel.flex_attention + from torch._C import _cuda_getCurrentRawStream as get_raw_stream + import triton + import triton.language as tl + from torch._inductor.runtime.triton_heuristics import grid, split_scan_grid, grid_combo_kernels, start_graph, end_graph + + aten = torch.ops.aten + inductor_ops = torch.ops.inductor + _quantized = torch.ops._quantized + assert_size_stride = torch._C._dynamo.guards.assert_size_stride + empty_strided_cpu = torch._C._dynamo.guards._empty_strided_cpu + empty_strided_cuda = torch._C._dynamo.guards._empty_strided_cuda + empty_strided_xpu = torch._C._dynamo.guards._empty_strided_xpu + reinterpret_tensor = torch._C._dynamo.guards._reinterpret_tensor + alloc_from_pool = torch.ops.inductor._alloc_from_pool + async_compile = AsyncCompile() + + + # kernel path: /tmp/oulgen/tmp4z1i5ywe/ke/ckedjitfju7kpaxnpatsaz3gzkdz6znc4u5v2e22w2xzbjdvecy5.py + # Topologically Sorted Source Nodes: [flex_attention], Original ATen: [] + # Source node to ATen node mapping: + # flex_attention => flex_attention + # Graph fragment: + # %flex_attention : [num_users=1] = call_function[target=torch.ops.higher_order.flex_attention](args = (%arg0_1, %arg1_1, %arg2_1, %sdpa_score0, (%arg3_1, %arg4_1, %arg5_1, %arg6_1, %arg7_1, %arg8_1, %arg9_1, %arg10_1, 128, 128, %sdpa_mask0), 0.125, {ROWS_GUARANTEED_SAFE: False, PRESCALE_QK: False, OUTPUT_LOGSUMEXP: False}, (), ()), kwargs = {}) + triton_tem_fused_0 = async_compile.triton('triton_tem_fused_0', ''' + import triton + import triton.language as tl + from triton.compiler.compiler import AttrsDescriptor + + from torch._inductor.runtime import triton_helpers, triton_heuristics + from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math + from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, instance_descriptor, DeviceProperties + + @triton_heuristics.template( + num_stages=3, + num_warps=4, + triton_meta={'signature': {'arg_Q': '*fp32', 'arg_K': '*fp32', 'arg_V': '*fp32', 'arg_LSE': '*fp32', 'arg_KV_NUM_BLKS': '*i32', 'arg_KV_IDX': '*i32', 'arg_FULL_KV_NUM_BLKS': '*i32', 'arg_FULL_KV_IDX': '*i32', 'out_ptr0': '*fp32'}, 'device': DeviceProperties(type='cuda', index=0, cc=80, major=8, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, multi_processor_count=108, warp_size=32), 'constants': {}, 'configs': [AttrsDescriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8), equal_to_1=())]}, + inductor_meta={'kernel_name': 'triton_tem_fused_0', 'backend_hash': 'FB2CA426CF35F271C56C0D69873498391AC248E25890F2B631CA8B52D56952BD', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': False, 'autotune_pointwise': True, 'autotune_remote_cache': False, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False}, + ) + @triton.jit + def triton_tem_fused_0(arg_Q, arg_K, arg_V, arg_LSE, arg_KV_NUM_BLKS, arg_KV_IDX, arg_FULL_KV_NUM_BLKS, arg_FULL_KV_IDX, out_ptr0): + ROWS_GUARANTEED_SAFE : tl.constexpr = False + PRESCALE_QK : tl.constexpr = False + OUTPUT_LOGSUMEXP : tl.constexpr = False + FLOAT32_PRECISION : tl.constexpr = 'ieee' + IS_DIVISIBLE : tl.constexpr = True + SM_SCALE : tl.constexpr = 0.125 + GQA_SHARED_HEADS : tl.constexpr = 1 + HAS_FULL_BLOCKS : tl.constexpr = True + QK_HEAD_DIM : tl.constexpr = 64 + V_HEAD_DIM : tl.constexpr = 64 + BLOCK_M : tl.constexpr = 128 + BLOCK_N : tl.constexpr = 32 + SPARSE_Q_BLOCK_SIZE : tl.constexpr = 128 + SPARSE_KV_BLOCK_SIZE : tl.constexpr = 128 + Q = arg_Q + K = arg_K + V = arg_V + LSE = arg_LSE + KV_NUM_BLKS = arg_KV_NUM_BLKS + KV_IDX = arg_KV_IDX + FULL_KV_NUM_BLKS = arg_FULL_KV_NUM_BLKS + FULL_KV_IDX = arg_FULL_KV_IDX + + # Sub notation for this kernel: + # + # Q: Query, K: Key, V: Value + # M: Number of queries, N: Number of keys/values, D: Model dimension + # QK_HEAD_DIM: The dimension of the query and key embeddings + # V_HEAD_DIM: The dimension of the value embeddings + # z: Batch size, h: Number of heads, m: Number of queries per head, k: Number of keys per head + # GQA_SHARED_HEADS: number of query heads sharing one kv head in GQA setups. + # + # The following FULL_* and PARTIAL_* is defined in the block sparse mask grid, rather than the thread block grid. + # KV_NUM_BLKS: The number of KV blocks (that may or may not require masking) for each query. + # KV_IDX: The indices of KV blocks (that may or may not require masking) for each query. + # FULL_KV_NUM_BLKS: The number of fully unmasked KV blocks (so we don't need masking) for each query. + # FULL_KV_IDX: The indices of fully unmasked KV blocks (so we don't need masking) for each query. + # + # OUTPUT_LOGSUMEXP: We only need to store the logsumexp if we require grad + # + # (Modifiable) Performance tuning options + # BLOCK_M: The thread block size across the seqlen dim of Q. + # BLOCK_N: Iterate over BLOCK_N across the seqlen dim of K/V in each thread block. + + # The below are kernel options that can be applied for certain score_mods, + # or involve a numerics vs. perf tradeoff + # PRESCALE_QK: Whether to pre-scale QK by 1/sqrt(d) and change of base. Has + # about 20% more numerical error, but slightly faster. + # ROWS_GUARANTEED_SAFE: Is it guaranteed that at least one value in each row + # is not masked out? If so, we can skip an extra safety check + + tl.static_assert(SPARSE_Q_BLOCK_SIZE >= BLOCK_M and SPARSE_Q_BLOCK_SIZE % BLOCK_M == 0) + tl.static_assert(SPARSE_KV_BLOCK_SIZE >= BLOCK_N and SPARSE_KV_BLOCK_SIZE % BLOCK_N == 0) + + # Define strides of inputs + stride_qz, stride_qh, stride_qm, stride_qk = 131072, 32768, 64, 1 + stride_kz, stride_kh, stride_kn, stride_kk = 131072, 32768, 64, 1 + stride_vz, stride_vh, stride_vn, stride_vk = 131072, 32768, 64, 1 + + ZQ = 1 + HQ = 4 + Q_LEN = 512 + ZKV = 1 + KV_LEN = 512 + + MATMUL_PRECISION = Q.dtype.element_ty + + q_start = tl.program_id(0) + off_zq = tl.program_id(1) // HQ + off_hq = tl.program_id(1) % HQ + + # We support two cases for batch dimension. a) (ZKV == ZQ) where off_zkv = off_zq. + # b) (ZKV == 1 and ZQ > 1) where KV is broadcasted along the batch dimension and off_zkv=0. + off_zkv = off_zq % ZKV + off_hkv = off_hq // GQA_SHARED_HEADS + off_g = off_hq % GQA_SHARED_HEADS + + q_offset = off_zq * stride_qz + off_hq * stride_qh + k_offset = off_zkv * stride_kz + off_hkv * stride_kh + v_offset = off_zkv * stride_vz + off_hkv * stride_vh + + Q = Q + q_offset + K = K + k_offset + V = V + v_offset + + SPARSE_Z = 1 + SPARSE_HQ = 1 + + sparse_idx_z = off_zq % SPARSE_Z + sparse_idx_hq = off_hq % SPARSE_HQ + + SPARSE_Q_MULTIPLE: tl.constexpr = (SPARSE_Q_BLOCK_SIZE // BLOCK_M) + SPARSE_KV_MULTIPLE: tl.constexpr = (SPARSE_KV_BLOCK_SIZE // BLOCK_N) + + stride_kv_num_blks_h = 16 + stride_kv_idx_h = 256 + stride_kv_idx_m = 16 + + # initialize pointer to m and l + m_i = tl.zeros([BLOCK_M], dtype=tl.float32) - float("inf") + l_i = tl.zeros([BLOCK_M], dtype=tl.float32) + acc = tl.zeros([BLOCK_M, V_HEAD_DIM], dtype=tl.float32) + + offs_m = q_start * BLOCK_M + tl.arange(0, BLOCK_M) + + # KV_IDX and KV_NUM_BLKS are always contiguous. + sparse_hz_offset = sparse_idx_z * SPARSE_HQ + sparse_idx_hq + sparse_kv_num_blks_offset = sparse_hz_offset * stride_kv_num_blks_h + q_start // SPARSE_Q_MULTIPLE + sparse_kv_idx_offset = sparse_hz_offset * stride_kv_idx_h + (q_start // SPARSE_Q_MULTIPLE) * stride_kv_idx_m # noqa: B950 + + Q_block_ptr = tl.make_block_ptr( + base=Q, + shape=(Q_LEN, QK_HEAD_DIM), + strides=(stride_qm, stride_qk), + offsets=(q_start * BLOCK_M, 0), + block_shape=(BLOCK_M, QK_HEAD_DIM), + order=(1, 0) + ) + + # load q: it stays in SRAM throughout the inner loop. + if IS_DIVISIBLE: + q = tl.load(Q_block_ptr) + else: + # boundary check is not free, so we only do it when necessary. + q = tl.load(Q_block_ptr, boundary_check=(0,), padding_option = "zero") + + # ~~~~~~~~~~~~~~ normal blocks ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + # We don't know anything "special" about these blocks, so we need to apply + # both score_mod and mask_mod to it + kv_indices = KV_IDX + sparse_kv_idx_offset + kv_start = tl.load(kv_indices) * SPARSE_KV_BLOCK_SIZE # first kv block we're loading + kv_num_blocks = tl.load(KV_NUM_BLKS + sparse_kv_num_blks_offset) + block_n_end = tl.minimum(kv_num_blocks * SPARSE_KV_MULTIPLE, tl.maximum(tl.cdiv(KV_LEN, BLOCK_N), 1)) + + K_block_ptr = tl.make_block_ptr( + base=K, + shape=(QK_HEAD_DIM, KV_LEN), + strides=(stride_kk, stride_kn), + offsets=(0, kv_start), + block_shape=(QK_HEAD_DIM, BLOCK_N), + order=(0, 1) + ) + V_block_ptr = tl.make_block_ptr( + base=V, + shape=(KV_LEN, V_HEAD_DIM), + strides=(stride_vn, stride_vk), + offsets=(kv_start, 0), + block_shape=(BLOCK_N, V_HEAD_DIM), + order=(1, 0) + ) + offs_n = kv_start + tl.arange(0, BLOCK_N) + + acc, l_i, m_i = forward_inner( + arg_Q, arg_K, arg_V, arg_LSE, arg_KV_NUM_BLKS, arg_KV_IDX, arg_FULL_KV_NUM_BLKS, arg_FULL_KV_IDX, out_ptr0, + q, K_block_ptr, V_block_ptr, Q_LEN, KV_LEN, + acc, l_i, m_i, + off_zq, off_hq, offs_m[:, None], offs_n[None, :], + kv_indices, kv_num_blocks, + 0, block_n_end, + MATMUL_PRECISION, + IS_FULL_BLOCKS=False, + ) + + # ~~~~~~~~~~~~~~ "full" blocks ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + # We know these blocks are guaranteed to be "full", so we don't need to + # apply mask_mod to them - only score_mod + if HAS_FULL_BLOCKS: + # FULL_KV_IDX and FULL_KV_NUM_BLKS are always contiguous. + kv_indices = FULL_KV_IDX + sparse_kv_idx_offset + kv_start = tl.load(kv_indices) * SPARSE_KV_BLOCK_SIZE # first kv block we're loading + kv_num_blocks = tl.load(FULL_KV_NUM_BLKS + sparse_kv_num_blks_offset) + block_n_end = tl.minimum(kv_num_blocks * SPARSE_KV_MULTIPLE, tl.maximum(tl.cdiv(KV_LEN, BLOCK_N), 1)) + + K_block_ptr = tl.make_block_ptr( + base=K, + shape=(QK_HEAD_DIM, KV_LEN), + strides=(stride_kk, stride_kn), + offsets=(0, kv_start), + block_shape=(QK_HEAD_DIM, BLOCK_N), + order=(0, 1) + ) + V_block_ptr = tl.make_block_ptr( + base=V, + shape=(KV_LEN, V_HEAD_DIM), + strides=(stride_vn, stride_vk), + offsets=(kv_start, 0), + block_shape=(BLOCK_N, V_HEAD_DIM), + order=(1, 0) + ) + offs_n = kv_start + tl.arange(0, BLOCK_N) + + acc, l_i, m_i = forward_inner( + arg_Q, arg_K, arg_V, arg_LSE, arg_KV_NUM_BLKS, arg_KV_IDX, arg_FULL_KV_NUM_BLKS, arg_FULL_KV_IDX, out_ptr0, + q, K_block_ptr, V_block_ptr, Q_LEN, KV_LEN, + acc, l_i, m_i, + off_zq, off_hq, offs_m[:, None], offs_n[None, :], + kv_indices, kv_num_blocks, + 0, block_n_end, + MATMUL_PRECISION, + IS_FULL_BLOCKS=True, + ) + + + # [Note] Handle fully masked out rows: + # Li will be the sum(e^(-inf)) == 0.0 for masked out rows, mi will be -inf. + # We set Li to 1.0 which will result in lse/out = 0.0 | after the log(li) + mi(0.0) step + l_i = tl.where(l_i == 0.0, 1, l_i) + + acc = acc / l_i[:, None] + idx_zq = tl.program_id(1) // HQ + idx_hq = tl.program_id(1) % HQ + idx_m = offs_m[:, None] + idx_d = tl.arange(0, V_HEAD_DIM)[None, :] + + mask = idx_m < Q_LEN + # TODO generalize and add proper mask support + xindex = idx_d + (64*idx_m) + (32768*idx_hq) + (131072*idx_zq) + tl.store(out_ptr0 + (tl.broadcast_to(idx_d + (64*idx_m) + (32768*idx_hq), acc.shape)), acc, mask) + + # TODO dont want to write this if we dont require grad + if OUTPUT_LOGSUMEXP: + off_hz = tl.program_id(1) + l_ptrs = LSE + off_hz * Q_LEN + offs_m + lse = m_i + tl.math.log2(l_i) + if IS_DIVISIBLE: + tl.store(l_ptrs, lse) + else: + tl.store(l_ptrs, lse, mask=offs_m < Q_LEN) + + @triton.jit + def forward_inner( + arg_Q, arg_K, arg_V, arg_LSE, arg_KV_NUM_BLKS, arg_KV_IDX, arg_FULL_KV_NUM_BLKS, arg_FULL_KV_IDX, out_ptr0, + q, K_block_ptr, V_block_ptr, Q_LEN, KV_LEN, + # accumulated values + acc, l_i, m_i, + # Offsets used as inputs to score_mod & mask_mod + # of size [BLOCK_M, BLOCK_N] or scalar. + off_z, off_h, offs_m, offs_n, + # blocksparse data + kv_indices, kv_num_blocks, + # start kv and end kv block + block_n_start, block_n_end, + MATMUL_PRECISION, + IS_FULL_BLOCKS, + ): + # Redefines all kernel parameters (BLOCK_M, etc.) so we don't need to plumb them all through + ROWS_GUARANTEED_SAFE : tl.constexpr = False + PRESCALE_QK : tl.constexpr = False + OUTPUT_LOGSUMEXP : tl.constexpr = False + FLOAT32_PRECISION : tl.constexpr = 'ieee' + IS_DIVISIBLE : tl.constexpr = True + SM_SCALE : tl.constexpr = 0.125 + GQA_SHARED_HEADS : tl.constexpr = 1 + HAS_FULL_BLOCKS : tl.constexpr = True + QK_HEAD_DIM : tl.constexpr = 64 + V_HEAD_DIM : tl.constexpr = 64 + BLOCK_M : tl.constexpr = 128 + BLOCK_N : tl.constexpr = 32 + SPARSE_Q_BLOCK_SIZE : tl.constexpr = 128 + SPARSE_KV_BLOCK_SIZE : tl.constexpr = 128 + + + SPARSE_KV_MULTIPLE: tl.constexpr = (SPARSE_KV_BLOCK_SIZE // BLOCK_N) + RCP_LN2: tl.constexpr = 1.44269504 + + if PRESCALE_QK: + q = (q * SM_SCALE * RCP_LN2).to(MATMUL_PRECISION) + + # loop over k, v and update accumulator until block_n_end + for start_n in range(block_n_start, block_n_end): + if IS_DIVISIBLE: + acc, l_i, m_i = forward_block_mn( + arg_Q, arg_K, arg_V, arg_LSE, arg_KV_NUM_BLKS, arg_KV_IDX, arg_FULL_KV_NUM_BLKS, arg_FULL_KV_IDX, out_ptr0, + q, K_block_ptr, V_block_ptr, Q_LEN, KV_LEN, + # accumulated values + acc, l_i, m_i, + # Offsets + off_z, off_h, offs_m, offs_n, + MATMUL_PRECISION, RCP_LN2, + IS_FULL_BLOCKS, + ) + else: + # Benchmark shows even we applied mod & mask to each block for non divisible seqlen, + # it's on par or slightly faster than only applying to the last block in fwd. + # However, we choose different strategy for bwd, where we only apply mod & mask + # to the last block because it's faster a lot. + acc, l_i, m_i = forward_block_mn( + arg_Q, arg_K, arg_V, arg_LSE, arg_KV_NUM_BLKS, arg_KV_IDX, arg_FULL_KV_NUM_BLKS, arg_FULL_KV_IDX, out_ptr0, + q, K_block_ptr, V_block_ptr, Q_LEN, KV_LEN, + # accumulated values + acc, l_i, m_i, + # Offsets + off_z, off_h, offs_m, offs_n, + MATMUL_PRECISION, RCP_LN2, + IS_FULL_BLOCKS, CHECK_BLOCK_BOUNDARY=True, + ) + + # update pointers + offset = get_offset_for_next_block( + start_n, kv_indices, kv_num_blocks, + SPARSE_KV_BLOCK_SIZE, SPARSE_KV_MULTIPLE, BLOCK_N + ) + + V_block_ptr = tl.advance(V_block_ptr, (offset, 0)) + K_block_ptr = tl.advance(K_block_ptr, (0, offset)) + + offs_n = offs_n + offset + + return acc, l_i, m_i + + + @triton.jit + def get_offset_for_next_block(loop_iter, col_indices, total_blocks, SPARSE_BLOCK, SPARSE_BLOCK_MULTIPLE, BLOCK): + cur_block_idx = loop_iter // SPARSE_BLOCK_MULTIPLE + cur_block = tl.load(col_indices + cur_block_idx, eviction_policy="evict_last") + next_block = tl.load(col_indices + cur_block_idx + 1, eviction_policy="evict_last", mask=cur_block_idx + 1 < total_blocks) + needs_jump = (loop_iter + 1) % SPARSE_BLOCK_MULTIPLE == 0 + jump_to_block = (next_block - cur_block ) * SPARSE_BLOCK - (SPARSE_BLOCK_MULTIPLE - 1) * BLOCK + + offset = jump_to_block * needs_jump + (1 - needs_jump) * BLOCK + return offset + + @triton.jit + def forward_block_mn( + arg_Q, arg_K, arg_V, arg_LSE, arg_KV_NUM_BLKS, arg_KV_IDX, arg_FULL_KV_NUM_BLKS, arg_FULL_KV_IDX, out_ptr0, + q, K_block_ptr, V_block_ptr, Q_LEN, KV_LEN, + # accumulated values + acc, l_i, m_i, + # Offsets + off_z, off_h, offs_m, offs_n, + MATMUL_PRECISION, RCP_LN2, + IS_FULL_BLOCKS, CHECK_BLOCK_BOUNDARY=False, + ): + # Redefines all kernel parameters (BLOCK_M, etc.) so we don't need to plumb them all through + ROWS_GUARANTEED_SAFE : tl.constexpr = False + PRESCALE_QK : tl.constexpr = False + OUTPUT_LOGSUMEXP : tl.constexpr = False + FLOAT32_PRECISION : tl.constexpr = 'ieee' + IS_DIVISIBLE : tl.constexpr = True + SM_SCALE : tl.constexpr = 0.125 + GQA_SHARED_HEADS : tl.constexpr = 1 + HAS_FULL_BLOCKS : tl.constexpr = True + QK_HEAD_DIM : tl.constexpr = 64 + V_HEAD_DIM : tl.constexpr = 64 + BLOCK_M : tl.constexpr = 128 + BLOCK_N : tl.constexpr = 32 + SPARSE_Q_BLOCK_SIZE : tl.constexpr = 128 + SPARSE_KV_BLOCK_SIZE : tl.constexpr = 128 + + + # -- load k -- + if IS_DIVISIBLE: + k = tl.load(K_block_ptr) + else: + k = tl.load(K_block_ptr, boundary_check=(1,), padding_option = "zero") + # -- compute qk --- + qk = tl.dot(q, k, input_precision=FLOAT32_PRECISION) # TODO: use cuda matmul when q_len <= 2. + if not PRESCALE_QK: + qk *= SM_SCALE + # ~~~~~~~~~~~~~~~~~~~ Apply score modification ~~~~~~~~~~~~~~~~~~~ + if CHECK_BLOCK_BOUNDARY: + # If this is the last block of a non divisible seqlen, we still need to load [BLOCK_M, BLOCK_N] elements, + # which is larger than the actual number of elements. To avoid access memory out of bound, + # we need to mask out the elements that are out of Q_LEN & KV_LEN. + m = offs_m % Q_LEN + n = offs_n % KV_LEN + else: + m = offs_m + n = offs_n + + tmp0 = (m) - (n) + tmp1 = tmp0.to(tl.float32) + tmp2 = (qk) + tmp1 + post_mod_scores = tmp2 + + + if CHECK_BLOCK_BOUNDARY: + # Mask out the elements that are out of the KV_LEN for non divisible seqlen. + post_mod_scores = tl.where(offs_n < KV_LEN, post_mod_scores, float("-inf")) + + if not IS_FULL_BLOCKS: + tmp3 = (m) >= (n) + mask_mod_output = tmp3 + + + if CHECK_BLOCK_BOUNDARY: + mask_mod_output = tl.where(offs_n < KV_LEN, mask_mod_output, float("-inf")) + # apply mask for partially unmasked blocks + post_mod_scores = tl.where(mask_mod_output, post_mod_scores, float("-inf")) + + # TODO: In the case that score_mod is linear, this can be LICMed + if not PRESCALE_QK: + post_mod_scores *= RCP_LN2 + # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + # -- compute scaling constant --- + m_ij = tl.maximum(m_i, tl.max(post_mod_scores, 1)) + if not ROWS_GUARANTEED_SAFE: + masked_out_rows = (m_ij == float("-inf")) + m_ij_masked = tl.where(masked_out_rows, 0, m_ij) + else: + m_ij_masked = m_ij + + alpha = tl.math.exp2(m_i - m_ij_masked) + p = tl.math.exp2(post_mod_scores - m_ij_masked[:, None]) + + # NB: l_i update is pulled up here since it's a bit faster + # NB: For headdim=256, it's faster to move it back down to after m_i = + # m_ij + l_i = l_i * alpha + tl.sum(p, 1) + # # -- scale and update acc -- + acc = acc * alpha[:, None] + + if IS_DIVISIBLE: + v = tl.load(V_block_ptr) + else: + v = tl.load(V_block_ptr, boundary_check=(0,), padding_option = "zero") + acc = tl.dot(p.to(MATMUL_PRECISION), v, acc, input_precision=FLOAT32_PRECISION) + + # -- update m_i + m_i = m_ij + + return acc, l_i, m_i + ''', device_str='cuda') + meta0 = {'ROWS_GUARANTEED_SAFE': False, 'PRESCALE_QK': False, 'OUTPUT_LOGSUMEXP': False, 'FLOAT32_PRECISION': "'ieee'", 'IS_DIVISIBLE': True, 'SM_SCALE': 0.125, 'GQA_SHARED_HEADS': 1, 'HAS_FULL_BLOCKS': True, 'QK_HEAD_DIM': 64, 'V_HEAD_DIM': 64, 'BLOCK_M': 128, 'BLOCK_N': 32, 'SPARSE_Q_BLOCK_SIZE': 128, 'SPARSE_KV_BLOCK_SIZE': 128} + + + async_compile.wait(globals()) + del async_compile + + def call(args): + arg0_1, arg1_1, arg2_1, arg3_1, arg4_1, arg5_1, arg6_1, arg7_1, arg8_1, arg9_1, arg10_1 = args + args.clear() + assert_size_stride(arg0_1, (1, 4, 512, 64), (131072, 32768, 64, 1)) + assert_size_stride(arg1_1, (1, 4, 512, 64), (131072, 32768, 64, 1)) + assert_size_stride(arg2_1, (1, 4, 512, 64), (131072, 32768, 64, 1)) + assert_size_stride(arg3_1, (1, 1, 16), (16, 16, 1)) + assert_size_stride(arg4_1, (1, 1, 16, 16), (256, 256, 16, 1)) + assert_size_stride(arg5_1, (1, 1, 16), (16, 16, 1)) + assert_size_stride(arg6_1, (1, 1, 16, 16), (256, 256, 16, 1)) + assert_size_stride(arg7_1, (1, 1, 16), (16, 16, 1)) + assert_size_stride(arg8_1, (1, 1, 16, 16), (256, 256, 16, 1)) + assert_size_stride(arg9_1, (1, 1, 16), (16, 16, 1)) + assert_size_stride(arg10_1, (1, 1, 16, 16), (256, 256, 16, 1)) + buf0 = empty_strided_cuda((1, 4, 512), (2048, 512, 1), torch.float32) + with torch.cuda._DeviceGuard(0): + torch.cuda.set_device(0) + buf1 = empty_strided_cuda((1, 4, 512, 64), (131072, 32768, 64, 1), torch.float32) + # Topologically Sorted Source Nodes: [flex_attention], Original ATen: [] + stream0 = get_raw_stream(0) + triton_tem_fused_0.run(arg0_1, arg1_1, arg2_1, buf0, arg3_1, arg4_1, arg5_1, arg6_1, buf1, grid=torch._inductor.kernel.flex_attention.flex_attention_grid(1, 4, 512, 64, meta0), stream=stream0) + del arg0_1 + del arg1_1 + del arg2_1 + del arg3_1 + del arg4_1 + del arg5_1 + del arg6_1 + del buf0 + return (buf1, ) + + + def benchmark_compiled_module(times=10, repeat=10): + from torch._dynamo.testing import rand_strided + from torch._inductor.utils import print_performance + arg0_1 = rand_strided((1, 4, 512, 64), (131072, 32768, 64, 1), device='cuda:0', dtype=torch.float32) + arg1_1 = rand_strided((1, 4, 512, 64), (131072, 32768, 64, 1), device='cuda:0', dtype=torch.float32) + arg2_1 = rand_strided((1, 4, 512, 64), (131072, 32768, 64, 1), device='cuda:0', dtype=torch.float32) + arg3_1 = rand_strided((1, 1, 16), (16, 16, 1), device='cuda:0', dtype=torch.int32) + arg4_1 = rand_strided((1, 1, 16, 16), (256, 256, 16, 1), device='cuda:0', dtype=torch.int32) + arg5_1 = rand_strided((1, 1, 16), (16, 16, 1), device='cuda:0', dtype=torch.int32) + arg6_1 = rand_strided((1, 1, 16, 16), (256, 256, 16, 1), device='cuda:0', dtype=torch.int32) + arg7_1 = rand_strided((1, 1, 16), (16, 16, 1), device='cuda:0', dtype=torch.int32) + arg8_1 = rand_strided((1, 1, 16, 16), (256, 256, 16, 1), device='cuda:0', dtype=torch.int32) + arg9_1 = rand_strided((1, 1, 16), (16, 16, 1), device='cuda:0', dtype=torch.int32) + arg10_1 = rand_strided((1, 1, 16, 16), (256, 256, 16, 1), device='cuda:0', dtype=torch.int32) + fn = lambda: call([arg0_1, arg1_1, arg2_1, arg3_1, arg4_1, arg5_1, arg6_1, arg7_1, arg8_1, arg9_1, arg10_1]) + return print_performance(fn, times=times, repeat=repeat) + + + if __name__ == "__main__": + from torch._inductor.wrapper_benchmark import compiled_module_main + compiled_module_main('None', benchmark_compiled_module) + +V1003 10:11:02.564000 2235078 torch/_dynamo/utils.py:988] {"chromium_event": {}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0, "has_payload": "a51341d292b7eede808d8a68c1fd524e"} + { + "name": "code_gen", + "ts": 1727975462564030.8, + "args": { + "cache_stats": { + "fxgraph_cache_hit": 0, + "fxgraph_cache_miss": 1, + "fxgraph_cache_bypass": 0 + } + }, + "ph": "E", + "cat": "dynamo_timed", + "tid": 0, + "pid": 0 + } +V1003 10:11:02.564000 2235078 torch/_dynamo/utils.py:988] {"chromium_event": {}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0, "has_payload": "0723c01321e1ba5c17eafbd9000d9e2d"} + { + "name": "GraphLowering.compile_to_module", + "ts": 1727975462564643.5, + "args": { + "cache_stats": { + "fxgraph_cache_hit": 0, + "fxgraph_cache_miss": 1, + "fxgraph_cache_bypass": 0 + } + }, + "ph": "E", + "cat": "dynamo_timed", + "tid": 0, + "pid": 0 + } +V1003 10:11:02.603000 2235078 torch/_dynamo/utils.py:1020] {"chromium_event": {}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0, "has_payload": "489dd15ea036ef0a9a0677625f691d7c"} + { + "name": "fx_graph_cache_miss", + "ts": 1727975456229537.0, + "args": { + "key": "f4lkea5y7lzhlshohvr3aqpd7bchdflfs7j5wn7mrurponawoutk", + "components": [ + "[n7x23yy6fih6vdcjzlzbhy3d6vx3ilu7ylp3zz6wkultu4yvnzn] gm: (\n (sdpa_score0): ()\n (sdpa_mask0): ()\n)\n\n\n\ndef forward(self, arg0_1, arg1_1, arg2_1, arg3_1, arg4_1, arg5_1, arg6_1, arg7_1, arg8_1, arg9_1, arg10_1):\n sdpa_score0 = self.sdpa_score0\n sdpa_mask0 = self.sdpa_mask0\n flex_attention = torch.ops.higher_order.flex_attention(arg0_1, arg1_1, arg2_1, sdpa_score0, (arg3_1, arg4_1, arg5_1, arg6_1, arg7_1, arg8_1, arg9_1, arg10_1, 128, 128, sdpa_mask0), 0.125, {'ROWS_GUARANTEED_SAFE': False, 'PRESCALE_QK': False, 'OUTPUT_LOGSUMEXP': False}, (), ()); arg0_1 = arg1_1 = arg2_1 = sdpa_score0 = arg3_1 = arg4_1 = arg5_1 = arg6_1 = arg7_1 = arg8_1 = arg9_1 = arg10_1 = sdpa_mask0 = None\n getitem = flex_attention[0]; flex_attention = None\n return (getitem,)\n \n# To see more debug info, please use `graph_module.print_readable()`", + "[avf2u3luxvyabchjhbddapcjn5gev47wfdtkrprayuhv6lf2z6u] example_inputs[0]: TensorMetadata(dtype=torch.float32, shape=torch.Size([1, 4, 512, 64]), stride=(131072, 32768, 64, 1), device=device(type='cuda', index=0), layout=torch.strided, memory_format=torch.contiguous_format, storage_offset=0, storage_bytes=None, requires_grad=False, is_quantized=False, is_conj=False, is_neg=False, is_inference=False, is_sparse=False, is_coalesced=None, dense_dim=None, sparse_dim=None)", + "[avf2u3luxvyabchjhbddapcjn5gev47wfdtkrprayuhv6lf2z6u] example_inputs[1]: TensorMetadata(dtype=torch.float32, shape=torch.Size([1, 4, 512, 64]), stride=(131072, 32768, 64, 1), device=device(type='cuda', index=0), layout=torch.strided, memory_format=torch.contiguous_format, storage_offset=0, storage_bytes=None, requires_grad=False, is_quantized=False, is_conj=False, is_neg=False, is_inference=False, is_sparse=False, is_coalesced=None, dense_dim=None, sparse_dim=None)", + "[avf2u3luxvyabchjhbddapcjn5gev47wfdtkrprayuhv6lf2z6u] example_inputs[2]: TensorMetadata(dtype=torch.float32, shape=torch.Size([1, 4, 512, 64]), stride=(131072, 32768, 64, 1), device=device(type='cuda', index=0), layout=torch.strided, memory_format=torch.contiguous_format, storage_offset=0, storage_bytes=None, requires_grad=False, is_quantized=False, is_conj=False, is_neg=False, is_inference=False, is_sparse=False, is_coalesced=None, dense_dim=None, sparse_dim=None)", + "[zsk3gejenkcvvwhiyk36u5zdnlrcs6wgy3pina3csuierfd2zri] example_inputs[3]: TensorMetadata(dtype=torch.int32, shape=torch.Size([1, 1, 16]), stride=(16, 16, 1), device=device(type='cuda', index=0), layout=torch.strided, memory_format=torch.contiguous_format, storage_offset=0, storage_bytes=None, requires_grad=False, is_quantized=False, is_conj=False, is_neg=False, is_inference=False, is_sparse=False, is_coalesced=None, dense_dim=None, sparse_dim=None)", + "[hnbjjzmb63q27mbr22eubaelyb423burv27meouma6ccysmwu6g] example_inputs[4]: TensorMetadata(dtype=torch.int32, shape=torch.Size([1, 1, 16, 16]), stride=(256, 256, 16, 1), device=device(type='cuda', index=0), layout=torch.strided, memory_format=torch.contiguous_format, storage_offset=0, storage_bytes=None, requires_grad=False, is_quantized=False, is_conj=False, is_neg=False, is_inference=False, is_sparse=False, is_coalesced=None, dense_dim=None, sparse_dim=None)", + "[zsk3gejenkcvvwhiyk36u5zdnlrcs6wgy3pina3csuierfd2zri] example_inputs[5]: TensorMetadata(dtype=torch.int32, shape=torch.Size([1, 1, 16]), stride=(16, 16, 1), device=device(type='cuda', index=0), layout=torch.strided, memory_format=torch.contiguous_format, storage_offset=0, storage_bytes=None, requires_grad=False, is_quantized=False, is_conj=False, is_neg=False, is_inference=False, is_sparse=False, is_coalesced=None, dense_dim=None, sparse_dim=None)", + "[hnbjjzmb63q27mbr22eubaelyb423burv27meouma6ccysmwu6g] example_inputs[6]: TensorMetadata(dtype=torch.int32, shape=torch.Size([1, 1, 16, 16]), stride=(256, 256, 16, 1), device=device(type='cuda', index=0), layout=torch.strided, memory_format=torch.contiguous_format, storage_offset=0, storage_bytes=None, requires_grad=False, is_quantized=False, is_conj=False, is_neg=False, is_inference=False, is_sparse=False, is_coalesced=None, dense_dim=None, sparse_dim=None)", + "[zsk3gejenkcvvwhiyk36u5zdnlrcs6wgy3pina3csuierfd2zri] example_inputs[7]: TensorMetadata(dtype=torch.int32, shape=torch.Size([1, 1, 16]), stride=(16, 16, 1), device=device(type='cuda', index=0), layout=torch.strided, memory_format=torch.contiguous_format, storage_offset=0, storage_bytes=None, requires_grad=False, is_quantized=False, is_conj=False, is_neg=False, is_inference=False, is_sparse=False, is_coalesced=None, dense_dim=None, sparse_dim=None)", + "[hnbjjzmb63q27mbr22eubaelyb423burv27meouma6ccysmwu6g] example_inputs[8]: TensorMetadata(dtype=torch.int32, shape=torch.Size([1, 1, 16, 16]), stride=(256, 256, 16, 1), device=device(type='cuda', index=0), layout=torch.strided, memory_format=torch.contiguous_format, storage_offset=0, storage_bytes=None, requires_grad=False, is_quantized=False, is_conj=False, is_neg=False, is_inference=False, is_sparse=False, is_coalesced=None, dense_dim=None, sparse_dim=None)", + "[zsk3gejenkcvvwhiyk36u5zdnlrcs6wgy3pina3csuierfd2zri] example_inputs[9]: TensorMetadata(dtype=torch.int32, shape=torch.Size([1, 1, 16]), stride=(16, 16, 1), device=device(type='cuda', index=0), layout=torch.strided, memory_format=torch.contiguous_format, storage_offset=0, storage_bytes=None, requires_grad=False, is_quantized=False, is_conj=False, is_neg=False, is_inference=False, is_sparse=False, is_coalesced=None, dense_dim=None, sparse_dim=None)", + "[hnbjjzmb63q27mbr22eubaelyb423burv27meouma6ccysmwu6g] example_inputs[10]: TensorMetadata(dtype=torch.int32, shape=torch.Size([1, 1, 16, 16]), stride=(256, 256, 16, 1), device=device(type='cuda', index=0), layout=torch.strided, memory_format=torch.contiguous_format, storage_offset=0, storage_bytes=None, requires_grad=False, is_quantized=False, is_conj=False, is_neg=False, is_inference=False, is_sparse=False, is_coalesced=None, dense_dim=None, sparse_dim=None)", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] fx_kwargs[aot_mode]: False", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] fx_kwargs[cpp_wrapper]: False", + "[xq2hdkbfkbcuye6rgtypayrkhqf4cntij2dsd24rei3lsknakkf] fx_kwargs[cudagraphs]: BoxedBool(value=False)", + "[tquy2we2efmowuj4wuqzcfcfdcrkzkzmwdae6hprj7fa64jpusq] fx_kwargs[extern_node_serializer]: None", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] fx_kwargs[is_backward]: False", + "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] fx_kwargs[is_inference]: True", + "[tquy2we2efmowuj4wuqzcfcfdcrkzkzmwdae6hprj7fa64jpusq] fx_kwargs[layout_opt]: None", + "[h25wqx6vliw4j5rtzzbv6latydxyei3deyg6v7wzvnzryfktuki] fx_kwargs[static_input_idxs]: []", + "[f44ag5aflby2bkxl7a4k6whljrk7jat7bmreuxklei4p3czhk7p] fx_kwargs[user_visible_outputs]: {'getitem': None}", + "[vrl5ktomgtzox5xucd3np6vug3vyj6hwwzahqijuwpmamlv7ohi] inputs_to_check[0]: 0", + "[aghvyrrgwvxijco2pk5wzc3cgmmthrbmgxitiibxuuscxdwrjd3] inputs_to_check[1]: 1", + "[pr5nr4a7dthirgd2ljo3d2xakc63ywxugusu6mkmr6gmpeliyib] inputs_to_check[2]: 2", + "[kcuxe2zwm3mzv2uk6adm6iskoy35bqfv725twacrdewod2dbl5d] inputs_to_check[3]: 3", + "[lkkae3meylaixfif4thncru4hjqeaislawjoghffrbwuscaagei] inputs_to_check[4]: 4", + "[qs5hilycp4ew4ivtc7m5jaxp7q4pm5slioxw3fi3ur6ei65ybz4] inputs_to_check[5]: 5", + "[agkvbkaha53nbz3aeeuhvxjvvc4glhfjofzkg6g2qjoo2e5otcx] inputs_to_check[6]: 6", + "[j3s5elu6itwgjafc7rzhy4whrbufl6kfmlufjhh25grt643bk5f] inputs_to_check[7]: 7", + "[yttmfmxblgcbsvbokguzowcorrcxz5uunxtcvsbe6nijgcx45he] inputs_to_check[8]: 8", + "[qlgfiyqewrmkgqth2qm6wkq2ja5lzkapg3ypgnvoyfqqnidaoj3] inputs_to_check[9]: 9", + "[j6c55jha5r2sdys2rwq7uqhtleea5dgjcye7nicfgft36v7xfvp] inputs_to_check[10]: 10", + "[du4vyrfyozrfxcf6kk6ma7oqwatapifazeelfsawmsiu6gjdtxp] deterministic_algorithms_settings: (False, False, True)", + "[qiptf2633zubseuei4bkisoq3not35l6lud6p23p4qmcsxiw2uq] cuda_matmul_settings: (False, True, True)", + "[7uhqwjfn75ek3woo3k7em2mluon5hx2ojvzlevlvjvz6xfxjhzl] torch_version: ", + "[c3z7bmoxyo6gl5hi47v6dc7jwsl55b3asd75nr25uyengi5ah3p] system_info[device]: {'name': 'NVIDIA PG509-210'}", + "[3fb7kae6ogkdd4zcm3fkjoipdpybxhn4aoxzv7z7xsfwq233e4l] system_info[version]: {'triton': '3.1.0+5fe38ffd73dc767c8fadcf23ea82d79e257c37d44077eae7f681cf967565fd43e9c017937b-835d4fc33500e1accafc5c5e00f4f73d87432c114860c04b68849bf6f942b8e5-dc767c8fadcf23ea82d79e257c37d44077eae7f681cf967565fd43e9c017937b-23d635e690d670bf61798e1259674b78c0ed5ba222ab6a455f329f27a758fc2d-e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855-20b017e9c4d858ab05e783f77df50b86c6d6eee5d79f3f4b158562b4a54f8443-f44338a31e0534290b08653050804c3fabbde403a6d3004ae04f0c28495f0802-e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855-a979896b9c0acfd41dd953b90bdc4b10968f7c0b45a286eae3f829aaddb2bb55-da771298f7bc45d24a61f35ef51742304421df1ab49d50bf1fc510dd5a46ea4b-a8fb7be728d460b7ec64ab62edb8af1bbca8994fd718cde7178d46bad64530a1-71330f394e584b0df29595d49f6ac8ac0c5503db9147090dc58ad888cebac7be-f24adfd52383f7866791ebaa5d45a5d2cc826e56ee2fd285f438e85d201fe643-a34be0d3ae4b3ac9aede195cfda42f8a0a097b2bc9642fb59673ce6b3b607f10-36130a37af1b19a0dec569aa08d30b00c74c8f02b6b632999d86dea169146792-36d42f0429aae027cb985b53b9abc616fae4dad9e0ea03953e1e9fb46d0fb9a0-e5d2cb724c08d0ef4130f3ba858d22cf21f834bfd970a5388aa6ad2a6bab91f9', 'cuda': '12.0'}", + "[z5x2bdhir5lzlbti73vdbfulnuu5vinzpwgmmgf4rjb775tzl3h] system_info[hash]: 9698c97edde4a99a2f3b54bbd0db5291bbcdb75c83acb376ccff61fb0bf0ac1a", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[TYPE_CHECKING]: False", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[abi_compatible]: False", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[aggressive_fusion]: False", + "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[allow_buffer_reuse]: True", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[allow_stack_allocation]: False", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[always_keep_tensor_constants]: False", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[aot_inductor.debug_compile]: False", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[aot_inductor.debug_dump_consts_bin]: False", + "[ngkkx5e6z7erl6da23zb2cmsctz4yvaqyameyg5hbqln4wrhh7x] inductor_config[aot_inductor.debug_intermediate_value_printer]: 0", + "[tquy2we2efmowuj4wuqzcfcfdcrkzkzmwdae6hprj7fa64jpusq] inductor_config[aot_inductor.filtered_kernel_names]: None", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[aot_inductor.force_mmap_weights]: False", + "[4bryyl4ahh5whyg3zwqebpwmjnx6w77nqgqbdjlowju6lkqtn7w] inductor_config[aot_inductor.metadata]: {}", + "[v3hzzlv4tjgvp3pyhmzagjd25orl6n7nynoa7svlhhwk73b7u3c] inductor_config[aot_inductor.output_path]: ", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[aot_inductor.package]: False", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[aot_inductor.package_cpp_only]: False", + "[v3hzzlv4tjgvp3pyhmzagjd25orl6n7nynoa7svlhhwk73b7u3c] inductor_config[aot_inductor.serialized_in_spec]: ", + "[v3hzzlv4tjgvp3pyhmzagjd25orl6n7nynoa7svlhhwk73b7u3c] inductor_config[aot_inductor.serialized_out_spec]: ", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[aot_inductor.use_runtime_constant_folding]: False", + "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[assert_indirect_indexing]: True", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[assume_aligned_inputs]: False", + "[v3hzzlv4tjgvp3pyhmzagjd25orl6n7nynoa7svlhhwk73b7u3c] inductor_config[autoheuristic_collect]: ", + "[jvchmi66fvqzlemhr5fcqorz5trfdtdalzfagtj2aolmimwqhdq] inductor_config[autoheuristic_log_path]: DEFAULT", + "[jwbrgxes7vjqumngs5hyj6gn5nytv2whnppnzngvaagfmawhkkd] inductor_config[autoheuristic_use]: mixed_mm", + "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[autotune_fallback_to_aten]: True", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[autotune_in_subproc]: False", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[autotune_local_cache]: False", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[autotune_multi_device]: False", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[autotune_remote_cache]: False", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[b2b_gemm_pass]: False", + "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[batch_fusion]: True", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[benchmark_combo_kernel]: False", + "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[benchmark_epilogue_fusion]: True", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[benchmark_fusion]: False", + "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[benchmark_harness]: True", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[benchmark_kernel]: False", + "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[bw_outputs_user_visible]: True", + "[b4ha3ravs3qv237q65hpfqegbnoww7tf2ahcbu2i7xo6te5spqs] inductor_config[c_shim_version]: 2", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[check_stack_no_cycles_TESTING_ONLY]: False", + "[aghvyrrgwvxijco2pk5wzc3cgmmthrbmgxitiibxuuscxdwrjd3] inductor_config[combo_kernel_allow_mixed_sizes]: 1", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[combo_kernel_foreach_dynamic_shapes]: False", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[combo_kernels]: False", + "[aghvyrrgwvxijco2pk5wzc3cgmmthrbmgxitiibxuuscxdwrjd3] inductor_config[combo_kernels_autotune]: 1", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[comment_origin]: False", + "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[comprehensive_padding]: True", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[compute_all_bounds]: False", + "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[constant_and_index_propagation]: True", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[conv_1x1_as_mm]: False", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[coordinate_descent_check_all_directions]: False", + "[aghvyrrgwvxijco2pk5wzc3cgmmthrbmgxitiibxuuscxdwrjd3] inductor_config[coordinate_descent_search_radius]: 1", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[coordinate_descent_tuning]: False", + "[c7zj4qytmety6keurs3hsh5wn7foxp3dqx4kym2ucszzcb2ngrf] inductor_config[cpp.cxx]: (None, 'g++')", + "[yrty22bseefglnysuoec4ji7j2rnaggdj3g33zzj7avogwfmgdw] inductor_config[cpp.descriptive_names]: original_aten", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[cpp.dynamic_threads]: False", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[cpp.enable_floating_point_contract_flag]: False", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[cpp.enable_kernel_profile]: False", + "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[cpp.enable_loop_tail_vec]: True", + "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[cpp.enable_tiling_heuristics]: True", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[cpp.enable_unsafe_math_opt_flag]: False", + "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[cpp.fallback_scatter_reduce_sum]: True", + "[tquy2we2efmowuj4wuqzcfcfdcrkzkzmwdae6hprj7fa64jpusq] inductor_config[cpp.gemm_cache_blocking]: None", + "[aghvyrrgwvxijco2pk5wzc3cgmmthrbmgxitiibxuuscxdwrjd3] inductor_config[cpp.gemm_max_k_slices]: 1", + "[tquy2we2efmowuj4wuqzcfcfdcrkzkzmwdae6hprj7fa64jpusq] inductor_config[cpp.gemm_thread_factors]: None", + "[tquy2we2efmowuj4wuqzcfcfdcrkzkzmwdae6hprj7fa64jpusq] inductor_config[cpp.inject_log1p_bug_TESTING_ONLY]: None", + "[tquy2we2efmowuj4wuqzcfcfdcrkzkzmwdae6hprj7fa64jpusq] inductor_config[cpp.inject_relu_bug_TESTING_ONLY]: None", + "[ebt2ncs4f5y7dn7btzi76mnouepvzad474tmp5iju4wiuumjl4s] inductor_config[cpp.max_horizontal_fusion_size]: 16", + "[g7rrnbg5yonzux3cfj5ovre5lob3ayda7qcfpxjvtwmiz4uicii] inductor_config[cpp.min_chunk_size]: 4096", + "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[cpp.no_redundant_loops]: True", + "[tquy2we2efmowuj4wuqzcfcfdcrkzkzmwdae6hprj7fa64jpusq] inductor_config[cpp.simdlen]: None", + "[sz3im5ogc6asp7g4uqocnovype63tkdexzfrniv6hn2oank3biu] inductor_config[cpp.threads]: -1", + "[tquy2we2efmowuj4wuqzcfcfdcrkzkzmwdae6hprj7fa64jpusq] inductor_config[cpp.vec_isa_ok]: None", + "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[cpp.weight_prepack]: True", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[cpp_wrapper]: False", + "[bsvfcwwoczx2rlkdz2eta6doujsymyihmi46hhwk6clrrvwcb6m] inductor_config[cpu_backend]: cpp", + "[tquy2we2efmowuj4wuqzcfcfdcrkzkzmwdae6hprj7fa64jpusq] inductor_config[cuda.arch]: None", + "[tvyftmtdmezlejo2xllu7awzv4pzc4vm4fub4b3gpl5jptjkosi] inductor_config[cuda.compile_opt_level]: -O1", + "[tquy2we2efmowuj4wuqzcfcfdcrkzkzmwdae6hprj7fa64jpusq] inductor_config[cuda.cuda_cxx]: None", + "[aghvyrrgwvxijco2pk5wzc3cgmmthrbmgxitiibxuuscxdwrjd3] inductor_config[cuda.cutlass_backend_min_gemm_size]: 1", + "[tquy2we2efmowuj4wuqzcfcfdcrkzkzmwdae6hprj7fa64jpusq] inductor_config[cuda.cutlass_max_profiling_configs]: None", + "[tquy2we2efmowuj4wuqzcfcfdcrkzkzmwdae6hprj7fa64jpusq] inductor_config[cuda.cutlass_op_allowlist_regex]: None", + "[lwkz5chtpji756gurqw4foijfi7zfgljtnn5nmnvdi2skpt4mgh] inductor_config[cuda.cutlass_op_denylist_regex]: pingpong", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[cuda.enable_cuda_lto]: False", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[cuda.enable_debug_info]: False", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[cuda.enable_ptxas_info]: False", + "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[cuda.generate_test_runner]: True", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[cuda.use_fast_math]: False", + "[tquy2we2efmowuj4wuqzcfcfdcrkzkzmwdae6hprj7fa64jpusq] inductor_config[cuda.version]: None", + "[caw4ly2z672k6kjfahoxwpajp5idhhtrpgf3ma2clylcp7c7aid] inductor_config[cuda_backend]: triton", + "[pikr7bbcoixfzftsazp5ggufhdklj24babfry77bl4nuvyrrcp4] inductor_config[custom_op_default_layout_constraint]: needs_fixed_stride_order", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[dce]: False", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[debug]: False", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[debug_fusion]: False", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[debug_index_asserts]: False", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[debug_ir_traceback]: False", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[decompose_mem_bound_mm]: False", + "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[developer_warnings]: True", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[disable_cpp_codegen]: False", + "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[disable_padding_cpu]: True", + "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[disable_progress]: True", + "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[dynamic_scale_rblock]: True", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[efficient_conv_bn_eval_fx_passes]: False", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[emulate_precision_casts]: False", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[enable_auto_functionalized_v2]: False", + "[v3hzzlv4tjgvp3pyhmzagjd25orl6n7nynoa7svlhhwk73b7u3c] inductor_config[enabled_metric_tables]: ", + "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[epilogue_fusion]: True", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[epilogue_fusion_first]: False", + "[lxxtoqhcoepwfokeiibd575gnxo3uzwiv4hmpomlwkpzqz3qzsh] inductor_config[estimate_op_runtime]: default", + "[h25wqx6vliw4j5rtzzbv6latydxyei3deyg6v7wzvnzryfktuki] inductor_config[external_matmul]: []", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[fallback_random]: False", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[force_disable_caches]: False", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[force_fuse_int_mm_with_mul]: False", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[force_layout_optimization]: False", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[force_same_precision]: False", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[force_shape_pad]: False", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[freezing]: False", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[freezing_discard_parameters]: False", + "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[fx_graph_cache]: True", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[fx_graph_remote_cache]: False", + "[62lrdx35b7hnumwb7mp5oc5y5csm2abylvtdzfloct3noaqov3n] inductor_config[fx_passes_numeric_check]: {'pre_grad': False, 'post_grad': False, 'precision': 0.0001, 'num_iterations': 1, 'requires_optimizer': True}", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[generate_intermediate_hooks]: False", + "[tquy2we2efmowuj4wuqzcfcfdcrkzkzmwdae6hprj7fa64jpusq] inductor_config[global_cache_dir]: None", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[group_fusion]: False", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[halide.asserts]: False", + "[ljhgflgihidopsfsdcbqynv27nceykby3nutyd5jlcpq7n6e7l4] inductor_config[halide.cpu_target]: host", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[halide.debug]: False", + "[wx7vmsmrdpk5ue2txlywp3lj3faqmdjphs5fgg2ehzsyno7uovg] inductor_config[halide.gpu_target]: host-cuda", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[halide.scan_kernels]: False", + "[k5ogk6345jvklsnu7g2njqstiz2g6pm5wmqpgg3kasrmuqwjvl6] inductor_config[halide.scheduler_cpu]: Adams2019", + "[svgytlua5wcyeia7wq7e6zgh5tsueikrnzchmdmouvmkpfsc2zq] inductor_config[halide.scheduler_cuda]: Anderson2021", + "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[implicit_fallbacks]: True", + "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[inplace_buffers]: True", + "[5fxczt3ciyxitdhizb7sfsgn7fhpczcqsngttnt5ot2wyctk7co] inductor_config[inter_node_bw]: 25", + "[yezuzjtg4h3jjur4jwtwiehbyixa7eonq4tqsqmwqve2lvvmrem] inductor_config[intra_node_bw]: 300", + "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[is_nightly_or_source]: True", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[is_predispatch]: False", + "[tquy2we2efmowuj4wuqzcfcfdcrkzkzmwdae6hprj7fa64jpusq] inductor_config[joint_custom_post_pass]: None", + "[tquy2we2efmowuj4wuqzcfcfdcrkzkzmwdae6hprj7fa64jpusq] inductor_config[joint_custom_pre_pass]: None", + "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[joint_graph_constant_folding]: True", + "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[keep_output_stride]: True", + "[j6c55jha5r2sdys2rwq7uqhtleea5dgjcye7nicfgft36v7xfvp] inductor_config[kernel_name_max_ops]: 10", + "[4p2fdjlvxrcw7c7fvzm5huhtqxnro4kvkx56f7p5zyrxqkwooov] inductor_config[layout_opt_default]: 1", + "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[layout_optimization]: True", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[loop_ordering_after_fusion]: False", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[max_autotune]: False", + "[uqlsbif4zxd75vt522p52txyuguieipi2lwz5g5awt56lccqk7s] inductor_config[max_autotune_conv_backends]: ATEN,TRITON", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[max_autotune_gemm]: False", + "[2y7luesktjrque3nr7qtxnum2mkbeegzdrsvkm3rvdlhqboajhx] inductor_config[max_autotune_gemm_backends]: ATEN,TRITON,CPP", + "[jvchmi66fvqzlemhr5fcqorz5trfdtdalzfagtj2aolmimwqhdq] inductor_config[max_autotune_gemm_search_space]: DEFAULT", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[max_autotune_pointwise]: False", + "[bh33ranllcgilhgmgr3qvygzxjm6isq5iexnfm3zx6fnr2zwlp2] inductor_config[max_autotune_subproc_graceful_timeout_seconds]: 1.0", + "[iglov24t7x5ruci344aer2tm6nqshi4veuw4wxlssxtu46cx76m] inductor_config[max_autotune_subproc_result_timeout_seconds]: 60.0", + "[pwoh5aypf4fxbntdvwt67rppxorqos6xr3w7qzeun6kblbfg2ga] inductor_config[max_autotune_subproc_terminate_timeout_seconds]: 2.0", + "[aghvyrrgwvxijco2pk5wzc3cgmmthrbmgxitiibxuuscxdwrjd3] inductor_config[max_epilogue_benchmarked_choices]: 1", + "[jykiys6ynafs3zdylwa5ggq6j655mxeh42d6mtdi22gffkrmiac] inductor_config[max_fusion_size]: 64", + "[yttmfmxblgcbsvbokguzowcorrcxz5uunxtcvsbe6nijgcx45he] inductor_config[max_pointwise_cat_inputs]: 8", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[memory_planning]: False", + "[x75won4jmsgeb63pcvwr2y4eteyzzdhmf5rv6xhjppie4hx2yu5] inductor_config[memory_pool]: intermediates", + "[v2td5s4lnsvyxvaevy4chx6kc5h3mm2axazbgwimqule5zrzao7] inductor_config[mixed_mm_choice]: heuristic", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[nan_asserts]: False", + "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[optimize_scatter_upon_const_tensor]: True", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[pad_channels_last]: False", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[pad_outputs]: False", + "[ljdqgtysl3vdf7j6attlz5gmjg2ncihnveojfyubosplmkrjgra] inductor_config[padding_alignment_bytes]: 128", + "[dnnw5ks3yxrp7mwvihb2hh4tqx35ye637xt33x64kw4fvz2nyzg] inductor_config[padding_stride_threshold]: 1024", + "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[pattern_matcher]: True", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[permute_fusion]: False", + "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[pick_loop_orders]: True", + "[tquy2we2efmowuj4wuqzcfcfdcrkzkzmwdae6hprj7fa64jpusq] inductor_config[post_grad_custom_post_pass]: None", + "[tquy2we2efmowuj4wuqzcfcfdcrkzkzmwdae6hprj7fa64jpusq] inductor_config[post_grad_custom_pre_pass]: None", + "[4bryyl4ahh5whyg3zwqebpwmjnx6w77nqgqbdjlowju6lkqtn7w] inductor_config[post_grad_fusion_options]: {}", + "[tquy2we2efmowuj4wuqzcfcfdcrkzkzmwdae6hprj7fa64jpusq] inductor_config[pre_grad_custom_pass]: None", + "[4bryyl4ahh5whyg3zwqebpwmjnx6w77nqgqbdjlowju6lkqtn7w] inductor_config[pre_grad_fusion_options]: {}", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[profile_bandwidth]: False", + "[tquy2we2efmowuj4wuqzcfcfdcrkzkzmwdae6hprj7fa64jpusq] inductor_config[profile_bandwidth_output]: None", + "[v3hzzlv4tjgvp3pyhmzagjd25orl6n7nynoa7svlhhwk73b7u3c] inductor_config[profile_bandwidth_regex]: ", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[profile_bandwidth_with_do_bench_using_profiling]: False", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[profiler_mark_wrapper_call]: False", + "[yttmfmxblgcbsvbokguzowcorrcxz5uunxtcvsbe6nijgcx45he] inductor_config[realize_acc_reads_threshold]: 8", + "[rr5m5hsocoyodldz7vcvaizdwvm2rt34evmqdxvng7wz3tufvo6] inductor_config[realize_opcount_threshold]: 30", + "[lkkae3meylaixfif4thncru4hjqeaislawjoghffrbwuscaagei] inductor_config[realize_reads_threshold]: 4", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[reorder_for_compute_comm_overlap]: False", + "[ssupi7bu3rrhdpg2jyegzncu3kg3nnhklyliqvutaxgs7y7k3dx] inductor_config[reorder_for_compute_comm_overlap_passes]: ['reorder_compute_for_overlap', 'sink_waits', 'raise_comms']", + "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[reorder_for_locality]: True", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[reorder_for_peak_memory]: False", + "[h25wqx6vliw4j5rtzzbv6latydxyei3deyg6v7wzvnzryfktuki] inductor_config[rocm.arch]: []", + "[tquy2we2efmowuj4wuqzcfcfdcrkzkzmwdae6hprj7fa64jpusq] inductor_config[rocm.ck_dir]: None", + "[oartxnko2l7d67tzwwm2otcumaut3n4wwcfgz3o377hmcveu5ft] inductor_config[rocm.ck_supported_arch]: ['gfx90a', 'gfx940', 'gfx941', 'gfx942']", + "[klfqjprnpfhcdurgvuikvc4rpd5ynkpk77toousr5h3u5roty6p] inductor_config[rocm.compile_opt_level]: -O2", + "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[rocm.flush_denormals]: True", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[rocm.is_debug]: False", + "[tquy2we2efmowuj4wuqzcfcfdcrkzkzmwdae6hprj7fa64jpusq] inductor_config[rocm.n_max_profiling_configs]: None", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[rocm.print_kernel_resource_usage]: False", + "[tquy2we2efmowuj4wuqzcfcfdcrkzkzmwdae6hprj7fa64jpusq] inductor_config[rocm.rocm_home]: None", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[rocm.save_temps]: False", + "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[rocm.use_fast_math]: True", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[rocm.use_preselected_instances]: False", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[save_args]: False", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[search_autotune_cache]: False", + "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[shape_padding]: True", + "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[size_asserts]: True", + "[tquy2we2efmowuj4wuqzcfcfdcrkzkzmwdae6hprj7fa64jpusq] inductor_config[sleep_sec_TESTING_ONLY]: None", + "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[split_cat_fx_passes]: True", + "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[split_reductions]: True", + "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[static_weight_shapes]: True", + "[tquy2we2efmowuj4wuqzcfcfdcrkzkzmwdae6hprj7fa64jpusq] inductor_config[triton.autotune_at_compile_time]: None", + "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[triton.autotune_cublasLt]: True", + "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[triton.autotune_pointwise]: True", + "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[triton.codegen_upcast_to_fp32]: True", + "[tuax46wac7rfv2trf5gcps6vleo3cq44lbnrdxtprvo3ljjaddj] inductor_config[triton.cudagraph_dynamic_shape_warn_limit]: 50", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[triton.cudagraph_skip_dynamic_graphs]: False", + "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[triton.cudagraph_support_input_mutation]: True", + "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[triton.cudagraph_trees]: True", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[triton.cudagraph_trees_history_recording]: False", + "[ljdqgtysl3vdf7j6attlz5gmjg2ncihnveojfyubosplmkrjgra] inductor_config[triton.cudagraph_unexpected_rerecord_limit]: 128", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[triton.cudagraphs]: False", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[triton.debug_sync_graph]: False", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[triton.debug_sync_kernel]: False", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[triton.dense_indexing]: False", + "[yrty22bseefglnysuoec4ji7j2rnaggdj3g33zzj7avogwfmgdw] inductor_config[triton.descriptive_names]: original_aten", + "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[triton.divisible_by_16]: True", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[triton.fast_path_cudagraph_asserts]: False", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[triton.force_cudagraph_sync]: False", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[triton.force_cudagraphs_warmup]: False", + "[tquy2we2efmowuj4wuqzcfcfdcrkzkzmwdae6hprj7fa64jpusq] inductor_config[triton.inject_relu_bug_TESTING_ONLY]: None", + "[pr5nr4a7dthirgd2ljo3d2xakc63ywxugusu6mkmr6gmpeliyib] inductor_config[triton.max_tiles]: 2", + "[fv6slhtedtydps5s5u2etitscliblzcidyitqf7krsv4e23fzk6] inductor_config[triton.min_split_scan_rblock]: 256", + "[vrl5ktomgtzox5xucd3np6vug3vyj6hwwzahqijuwpmamlv7ohi] inductor_config[triton.multi_kernel]: 0", + "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[triton.persistent_reductions]: True", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[triton.prefer_nd_tiling]: False", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[triton.skip_cudagraph_warmup]: False", + "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[triton.slow_path_cudagraph_asserts]: True", + "[ebt2ncs4f5y7dn7btzi76mnouepvzad474tmp5iju4wiuumjl4s] inductor_config[triton.spill_threshold]: 16", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[triton.store_cubin]: False", + "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[triton.tiling_prevents_pointwise_fusion]: True", + "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[triton.tiling_prevents_reduction_fusion]: True", + "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[triton.unique_kernel_names]: True", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[triton.use_block_ptr]: False", + "[vzzema5ityqj2wepdmkulue7q5pcevdr5h27oxxutf35d4tjume] inductor_config[triton_kernel_default_layout_constraint]: flexible_layout", + "[wft6ljqsfr3x4m7fa5zuyb7cwknky4irrxz4bjr6uzr2yiopxqj] inductor_config[unbacked_symint_fallback]: 8192", + "[yttmfmxblgcbsvbokguzowcorrcxz5uunxtcvsbe6nijgcx45he] inductor_config[unroll_reductions_threshold]: 8", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[unsafe_ignore_unsupported_triton_autotune_args]: False", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[use_minimal_arrayref_interface]: False", + "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[use_mixed_mm]: True", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[verbose_progress]: False", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[warn_mix_layout]: False" + ], + "cache_event_time": 1727975456229537044, + "cache_state": "miss", + "time_taken_ns": 6372999079 + }, + "ph": "i", + "cat": "dynamo_timed", + "tid": 0, + "pid": 0, + "s": "p" + } +V1003 10:11:02.604000 2235078 torch/_inductor/codecache.py:1463] {"artifact": {"name": "fx_graph_cache_miss", "encoding": "json"}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0, "has_payload": "785133baf5b9f8dad73506774a2a34dc"} + {"key": "f4lkea5y7lzhlshohvr3aqpd7bchdflfs7j5wn7mrurponawoutk", "components": ["[n7x23yy6fih6vdcjzlzbhy3d6vx3ilu7ylp3zz6wkultu4yvnzn] gm: (\n (sdpa_score0): ()\n (sdpa_mask0): ()\n)\n\n\n\ndef forward(self, arg0_1, arg1_1, arg2_1, arg3_1, arg4_1, arg5_1, arg6_1, arg7_1, arg8_1, arg9_1, arg10_1):\n sdpa_score0 = self.sdpa_score0\n sdpa_mask0 = self.sdpa_mask0\n flex_attention = torch.ops.higher_order.flex_attention(arg0_1, arg1_1, arg2_1, sdpa_score0, (arg3_1, arg4_1, arg5_1, arg6_1, arg7_1, arg8_1, arg9_1, arg10_1, 128, 128, sdpa_mask0), 0.125, {'ROWS_GUARANTEED_SAFE': False, 'PRESCALE_QK': False, 'OUTPUT_LOGSUMEXP': False}, (), ()); arg0_1 = arg1_1 = arg2_1 = sdpa_score0 = arg3_1 = arg4_1 = arg5_1 = arg6_1 = arg7_1 = arg8_1 = arg9_1 = arg10_1 = sdpa_mask0 = None\n getitem = flex_attention[0]; flex_attention = None\n return (getitem,)\n \n# To see more debug info, please use `graph_module.print_readable()`", "[avf2u3luxvyabchjhbddapcjn5gev47wfdtkrprayuhv6lf2z6u] example_inputs[0]: TensorMetadata(dtype=torch.float32, shape=torch.Size([1, 4, 512, 64]), stride=(131072, 32768, 64, 1), device=device(type='cuda', index=0), layout=torch.strided, memory_format=torch.contiguous_format, storage_offset=0, storage_bytes=None, requires_grad=False, is_quantized=False, is_conj=False, is_neg=False, is_inference=False, is_sparse=False, is_coalesced=None, dense_dim=None, sparse_dim=None)", "[avf2u3luxvyabchjhbddapcjn5gev47wfdtkrprayuhv6lf2z6u] example_inputs[1]: TensorMetadata(dtype=torch.float32, shape=torch.Size([1, 4, 512, 64]), stride=(131072, 32768, 64, 1), device=device(type='cuda', index=0), layout=torch.strided, memory_format=torch.contiguous_format, storage_offset=0, storage_bytes=None, requires_grad=False, is_quantized=False, is_conj=False, is_neg=False, is_inference=False, is_sparse=False, is_coalesced=None, dense_dim=None, sparse_dim=None)", "[avf2u3luxvyabchjhbddapcjn5gev47wfdtkrprayuhv6lf2z6u] example_inputs[2]: TensorMetadata(dtype=torch.float32, shape=torch.Size([1, 4, 512, 64]), stride=(131072, 32768, 64, 1), device=device(type='cuda', index=0), layout=torch.strided, memory_format=torch.contiguous_format, storage_offset=0, storage_bytes=None, requires_grad=False, is_quantized=False, is_conj=False, is_neg=False, is_inference=False, is_sparse=False, is_coalesced=None, dense_dim=None, sparse_dim=None)", "[zsk3gejenkcvvwhiyk36u5zdnlrcs6wgy3pina3csuierfd2zri] example_inputs[3]: TensorMetadata(dtype=torch.int32, shape=torch.Size([1, 1, 16]), stride=(16, 16, 1), device=device(type='cuda', index=0), layout=torch.strided, memory_format=torch.contiguous_format, storage_offset=0, storage_bytes=None, requires_grad=False, is_quantized=False, is_conj=False, is_neg=False, is_inference=False, is_sparse=False, is_coalesced=None, dense_dim=None, sparse_dim=None)", "[hnbjjzmb63q27mbr22eubaelyb423burv27meouma6ccysmwu6g] example_inputs[4]: TensorMetadata(dtype=torch.int32, shape=torch.Size([1, 1, 16, 16]), stride=(256, 256, 16, 1), device=device(type='cuda', index=0), layout=torch.strided, memory_format=torch.contiguous_format, storage_offset=0, storage_bytes=None, requires_grad=False, is_quantized=False, is_conj=False, is_neg=False, is_inference=False, is_sparse=False, is_coalesced=None, dense_dim=None, sparse_dim=None)", "[zsk3gejenkcvvwhiyk36u5zdnlrcs6wgy3pina3csuierfd2zri] example_inputs[5]: TensorMetadata(dtype=torch.int32, shape=torch.Size([1, 1, 16]), stride=(16, 16, 1), device=device(type='cuda', index=0), layout=torch.strided, memory_format=torch.contiguous_format, storage_offset=0, storage_bytes=None, requires_grad=False, is_quantized=False, is_conj=False, is_neg=False, is_inference=False, is_sparse=False, is_coalesced=None, dense_dim=None, sparse_dim=None)", "[hnbjjzmb63q27mbr22eubaelyb423burv27meouma6ccysmwu6g] example_inputs[6]: TensorMetadata(dtype=torch.int32, shape=torch.Size([1, 1, 16, 16]), stride=(256, 256, 16, 1), device=device(type='cuda', index=0), layout=torch.strided, memory_format=torch.contiguous_format, storage_offset=0, storage_bytes=None, requires_grad=False, is_quantized=False, is_conj=False, is_neg=False, is_inference=False, is_sparse=False, is_coalesced=None, dense_dim=None, sparse_dim=None)", "[zsk3gejenkcvvwhiyk36u5zdnlrcs6wgy3pina3csuierfd2zri] example_inputs[7]: TensorMetadata(dtype=torch.int32, shape=torch.Size([1, 1, 16]), stride=(16, 16, 1), device=device(type='cuda', index=0), layout=torch.strided, memory_format=torch.contiguous_format, storage_offset=0, storage_bytes=None, requires_grad=False, is_quantized=False, is_conj=False, is_neg=False, is_inference=False, is_sparse=False, is_coalesced=None, dense_dim=None, sparse_dim=None)", "[hnbjjzmb63q27mbr22eubaelyb423burv27meouma6ccysmwu6g] example_inputs[8]: TensorMetadata(dtype=torch.int32, shape=torch.Size([1, 1, 16, 16]), stride=(256, 256, 16, 1), device=device(type='cuda', index=0), layout=torch.strided, memory_format=torch.contiguous_format, storage_offset=0, storage_bytes=None, requires_grad=False, is_quantized=False, is_conj=False, is_neg=False, is_inference=False, is_sparse=False, is_coalesced=None, dense_dim=None, sparse_dim=None)", "[zsk3gejenkcvvwhiyk36u5zdnlrcs6wgy3pina3csuierfd2zri] example_inputs[9]: TensorMetadata(dtype=torch.int32, shape=torch.Size([1, 1, 16]), stride=(16, 16, 1), device=device(type='cuda', index=0), layout=torch.strided, memory_format=torch.contiguous_format, storage_offset=0, storage_bytes=None, requires_grad=False, is_quantized=False, is_conj=False, is_neg=False, is_inference=False, is_sparse=False, is_coalesced=None, dense_dim=None, sparse_dim=None)", "[hnbjjzmb63q27mbr22eubaelyb423burv27meouma6ccysmwu6g] example_inputs[10]: TensorMetadata(dtype=torch.int32, shape=torch.Size([1, 1, 16, 16]), stride=(256, 256, 16, 1), device=device(type='cuda', index=0), layout=torch.strided, memory_format=torch.contiguous_format, storage_offset=0, storage_bytes=None, requires_grad=False, is_quantized=False, is_conj=False, is_neg=False, is_inference=False, is_sparse=False, is_coalesced=None, dense_dim=None, sparse_dim=None)", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] fx_kwargs[aot_mode]: False", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] fx_kwargs[cpp_wrapper]: False", "[xq2hdkbfkbcuye6rgtypayrkhqf4cntij2dsd24rei3lsknakkf] fx_kwargs[cudagraphs]: BoxedBool(value=False)", "[tquy2we2efmowuj4wuqzcfcfdcrkzkzmwdae6hprj7fa64jpusq] fx_kwargs[extern_node_serializer]: None", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] fx_kwargs[is_backward]: False", "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] fx_kwargs[is_inference]: True", "[tquy2we2efmowuj4wuqzcfcfdcrkzkzmwdae6hprj7fa64jpusq] fx_kwargs[layout_opt]: None", "[h25wqx6vliw4j5rtzzbv6latydxyei3deyg6v7wzvnzryfktuki] fx_kwargs[static_input_idxs]: []", "[f44ag5aflby2bkxl7a4k6whljrk7jat7bmreuxklei4p3czhk7p] fx_kwargs[user_visible_outputs]: {'getitem': None}", "[vrl5ktomgtzox5xucd3np6vug3vyj6hwwzahqijuwpmamlv7ohi] inputs_to_check[0]: 0", "[aghvyrrgwvxijco2pk5wzc3cgmmthrbmgxitiibxuuscxdwrjd3] inputs_to_check[1]: 1", "[pr5nr4a7dthirgd2ljo3d2xakc63ywxugusu6mkmr6gmpeliyib] inputs_to_check[2]: 2", "[kcuxe2zwm3mzv2uk6adm6iskoy35bqfv725twacrdewod2dbl5d] inputs_to_check[3]: 3", "[lkkae3meylaixfif4thncru4hjqeaislawjoghffrbwuscaagei] inputs_to_check[4]: 4", "[qs5hilycp4ew4ivtc7m5jaxp7q4pm5slioxw3fi3ur6ei65ybz4] inputs_to_check[5]: 5", "[agkvbkaha53nbz3aeeuhvxjvvc4glhfjofzkg6g2qjoo2e5otcx] inputs_to_check[6]: 6", "[j3s5elu6itwgjafc7rzhy4whrbufl6kfmlufjhh25grt643bk5f] inputs_to_check[7]: 7", "[yttmfmxblgcbsvbokguzowcorrcxz5uunxtcvsbe6nijgcx45he] inputs_to_check[8]: 8", "[qlgfiyqewrmkgqth2qm6wkq2ja5lzkapg3ypgnvoyfqqnidaoj3] inputs_to_check[9]: 9", "[j6c55jha5r2sdys2rwq7uqhtleea5dgjcye7nicfgft36v7xfvp] inputs_to_check[10]: 10", "[du4vyrfyozrfxcf6kk6ma7oqwatapifazeelfsawmsiu6gjdtxp] deterministic_algorithms_settings: (False, False, True)", "[qiptf2633zubseuei4bkisoq3not35l6lud6p23p4qmcsxiw2uq] cuda_matmul_settings: (False, True, True)", "[7uhqwjfn75ek3woo3k7em2mluon5hx2ojvzlevlvjvz6xfxjhzl] torch_version: ", "[c3z7bmoxyo6gl5hi47v6dc7jwsl55b3asd75nr25uyengi5ah3p] system_info[device]: {'name': 'NVIDIA PG509-210'}", "[3fb7kae6ogkdd4zcm3fkjoipdpybxhn4aoxzv7z7xsfwq233e4l] system_info[version]: {'triton': '3.1.0+5fe38ffd73dc767c8fadcf23ea82d79e257c37d44077eae7f681cf967565fd43e9c017937b-835d4fc33500e1accafc5c5e00f4f73d87432c114860c04b68849bf6f942b8e5-dc767c8fadcf23ea82d79e257c37d44077eae7f681cf967565fd43e9c017937b-23d635e690d670bf61798e1259674b78c0ed5ba222ab6a455f329f27a758fc2d-e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855-20b017e9c4d858ab05e783f77df50b86c6d6eee5d79f3f4b158562b4a54f8443-f44338a31e0534290b08653050804c3fabbde403a6d3004ae04f0c28495f0802-e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855-a979896b9c0acfd41dd953b90bdc4b10968f7c0b45a286eae3f829aaddb2bb55-da771298f7bc45d24a61f35ef51742304421df1ab49d50bf1fc510dd5a46ea4b-a8fb7be728d460b7ec64ab62edb8af1bbca8994fd718cde7178d46bad64530a1-71330f394e584b0df29595d49f6ac8ac0c5503db9147090dc58ad888cebac7be-f24adfd52383f7866791ebaa5d45a5d2cc826e56ee2fd285f438e85d201fe643-a34be0d3ae4b3ac9aede195cfda42f8a0a097b2bc9642fb59673ce6b3b607f10-36130a37af1b19a0dec569aa08d30b00c74c8f02b6b632999d86dea169146792-36d42f0429aae027cb985b53b9abc616fae4dad9e0ea03953e1e9fb46d0fb9a0-e5d2cb724c08d0ef4130f3ba858d22cf21f834bfd970a5388aa6ad2a6bab91f9', 'cuda': '12.0'}", "[z5x2bdhir5lzlbti73vdbfulnuu5vinzpwgmmgf4rjb775tzl3h] system_info[hash]: 9698c97edde4a99a2f3b54bbd0db5291bbcdb75c83acb376ccff61fb0bf0ac1a", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[TYPE_CHECKING]: False", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[abi_compatible]: False", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[aggressive_fusion]: False", "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[allow_buffer_reuse]: True", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[allow_stack_allocation]: False", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[always_keep_tensor_constants]: False", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[aot_inductor.debug_compile]: False", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[aot_inductor.debug_dump_consts_bin]: False", "[ngkkx5e6z7erl6da23zb2cmsctz4yvaqyameyg5hbqln4wrhh7x] inductor_config[aot_inductor.debug_intermediate_value_printer]: 0", "[tquy2we2efmowuj4wuqzcfcfdcrkzkzmwdae6hprj7fa64jpusq] inductor_config[aot_inductor.filtered_kernel_names]: None", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[aot_inductor.force_mmap_weights]: False", "[4bryyl4ahh5whyg3zwqebpwmjnx6w77nqgqbdjlowju6lkqtn7w] inductor_config[aot_inductor.metadata]: {}", "[v3hzzlv4tjgvp3pyhmzagjd25orl6n7nynoa7svlhhwk73b7u3c] inductor_config[aot_inductor.output_path]: ", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[aot_inductor.package]: False", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[aot_inductor.package_cpp_only]: False", "[v3hzzlv4tjgvp3pyhmzagjd25orl6n7nynoa7svlhhwk73b7u3c] inductor_config[aot_inductor.serialized_in_spec]: ", "[v3hzzlv4tjgvp3pyhmzagjd25orl6n7nynoa7svlhhwk73b7u3c] inductor_config[aot_inductor.serialized_out_spec]: ", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[aot_inductor.use_runtime_constant_folding]: False", "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[assert_indirect_indexing]: True", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[assume_aligned_inputs]: False", "[v3hzzlv4tjgvp3pyhmzagjd25orl6n7nynoa7svlhhwk73b7u3c] inductor_config[autoheuristic_collect]: ", "[jvchmi66fvqzlemhr5fcqorz5trfdtdalzfagtj2aolmimwqhdq] inductor_config[autoheuristic_log_path]: DEFAULT", "[jwbrgxes7vjqumngs5hyj6gn5nytv2whnppnzngvaagfmawhkkd] inductor_config[autoheuristic_use]: mixed_mm", "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[autotune_fallback_to_aten]: True", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[autotune_in_subproc]: False", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[autotune_local_cache]: False", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[autotune_multi_device]: False", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[autotune_remote_cache]: False", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[b2b_gemm_pass]: False", "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[batch_fusion]: True", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[benchmark_combo_kernel]: False", "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[benchmark_epilogue_fusion]: True", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[benchmark_fusion]: False", "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[benchmark_harness]: True", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[benchmark_kernel]: False", "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[bw_outputs_user_visible]: True", "[b4ha3ravs3qv237q65hpfqegbnoww7tf2ahcbu2i7xo6te5spqs] inductor_config[c_shim_version]: 2", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[check_stack_no_cycles_TESTING_ONLY]: False", "[aghvyrrgwvxijco2pk5wzc3cgmmthrbmgxitiibxuuscxdwrjd3] inductor_config[combo_kernel_allow_mixed_sizes]: 1", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[combo_kernel_foreach_dynamic_shapes]: False", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[combo_kernels]: False", "[aghvyrrgwvxijco2pk5wzc3cgmmthrbmgxitiibxuuscxdwrjd3] inductor_config[combo_kernels_autotune]: 1", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[comment_origin]: False", "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[comprehensive_padding]: True", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[compute_all_bounds]: False", "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[constant_and_index_propagation]: True", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[conv_1x1_as_mm]: False", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[coordinate_descent_check_all_directions]: False", "[aghvyrrgwvxijco2pk5wzc3cgmmthrbmgxitiibxuuscxdwrjd3] inductor_config[coordinate_descent_search_radius]: 1", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[coordinate_descent_tuning]: False", "[c7zj4qytmety6keurs3hsh5wn7foxp3dqx4kym2ucszzcb2ngrf] inductor_config[cpp.cxx]: (None, 'g++')", "[yrty22bseefglnysuoec4ji7j2rnaggdj3g33zzj7avogwfmgdw] inductor_config[cpp.descriptive_names]: original_aten", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[cpp.dynamic_threads]: False", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[cpp.enable_floating_point_contract_flag]: False", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[cpp.enable_kernel_profile]: False", "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[cpp.enable_loop_tail_vec]: True", "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[cpp.enable_tiling_heuristics]: True", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[cpp.enable_unsafe_math_opt_flag]: False", "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[cpp.fallback_scatter_reduce_sum]: True", "[tquy2we2efmowuj4wuqzcfcfdcrkzkzmwdae6hprj7fa64jpusq] inductor_config[cpp.gemm_cache_blocking]: None", "[aghvyrrgwvxijco2pk5wzc3cgmmthrbmgxitiibxuuscxdwrjd3] inductor_config[cpp.gemm_max_k_slices]: 1", "[tquy2we2efmowuj4wuqzcfcfdcrkzkzmwdae6hprj7fa64jpusq] inductor_config[cpp.gemm_thread_factors]: None", "[tquy2we2efmowuj4wuqzcfcfdcrkzkzmwdae6hprj7fa64jpusq] inductor_config[cpp.inject_log1p_bug_TESTING_ONLY]: None", "[tquy2we2efmowuj4wuqzcfcfdcrkzkzmwdae6hprj7fa64jpusq] inductor_config[cpp.inject_relu_bug_TESTING_ONLY]: None", "[ebt2ncs4f5y7dn7btzi76mnouepvzad474tmp5iju4wiuumjl4s] inductor_config[cpp.max_horizontal_fusion_size]: 16", "[g7rrnbg5yonzux3cfj5ovre5lob3ayda7qcfpxjvtwmiz4uicii] inductor_config[cpp.min_chunk_size]: 4096", "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[cpp.no_redundant_loops]: True", "[tquy2we2efmowuj4wuqzcfcfdcrkzkzmwdae6hprj7fa64jpusq] inductor_config[cpp.simdlen]: None", "[sz3im5ogc6asp7g4uqocnovype63tkdexzfrniv6hn2oank3biu] inductor_config[cpp.threads]: -1", "[tquy2we2efmowuj4wuqzcfcfdcrkzkzmwdae6hprj7fa64jpusq] inductor_config[cpp.vec_isa_ok]: None", "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[cpp.weight_prepack]: True", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[cpp_wrapper]: False", "[bsvfcwwoczx2rlkdz2eta6doujsymyihmi46hhwk6clrrvwcb6m] inductor_config[cpu_backend]: cpp", "[tquy2we2efmowuj4wuqzcfcfdcrkzkzmwdae6hprj7fa64jpusq] inductor_config[cuda.arch]: None", "[tvyftmtdmezlejo2xllu7awzv4pzc4vm4fub4b3gpl5jptjkosi] inductor_config[cuda.compile_opt_level]: -O1", "[tquy2we2efmowuj4wuqzcfcfdcrkzkzmwdae6hprj7fa64jpusq] inductor_config[cuda.cuda_cxx]: None", "[aghvyrrgwvxijco2pk5wzc3cgmmthrbmgxitiibxuuscxdwrjd3] inductor_config[cuda.cutlass_backend_min_gemm_size]: 1", "[tquy2we2efmowuj4wuqzcfcfdcrkzkzmwdae6hprj7fa64jpusq] inductor_config[cuda.cutlass_max_profiling_configs]: None", "[tquy2we2efmowuj4wuqzcfcfdcrkzkzmwdae6hprj7fa64jpusq] inductor_config[cuda.cutlass_op_allowlist_regex]: None", "[lwkz5chtpji756gurqw4foijfi7zfgljtnn5nmnvdi2skpt4mgh] inductor_config[cuda.cutlass_op_denylist_regex]: pingpong", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[cuda.enable_cuda_lto]: False", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[cuda.enable_debug_info]: False", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[cuda.enable_ptxas_info]: False", "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[cuda.generate_test_runner]: True", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[cuda.use_fast_math]: False", "[tquy2we2efmowuj4wuqzcfcfdcrkzkzmwdae6hprj7fa64jpusq] inductor_config[cuda.version]: None", "[caw4ly2z672k6kjfahoxwpajp5idhhtrpgf3ma2clylcp7c7aid] inductor_config[cuda_backend]: triton", "[pikr7bbcoixfzftsazp5ggufhdklj24babfry77bl4nuvyrrcp4] inductor_config[custom_op_default_layout_constraint]: needs_fixed_stride_order", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[dce]: False", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[debug]: False", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[debug_fusion]: False", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[debug_index_asserts]: False", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[debug_ir_traceback]: False", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[decompose_mem_bound_mm]: False", "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[developer_warnings]: True", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[disable_cpp_codegen]: False", "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[disable_padding_cpu]: True", "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[disable_progress]: True", "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[dynamic_scale_rblock]: True", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[efficient_conv_bn_eval_fx_passes]: False", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[emulate_precision_casts]: False", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[enable_auto_functionalized_v2]: False", "[v3hzzlv4tjgvp3pyhmzagjd25orl6n7nynoa7svlhhwk73b7u3c] inductor_config[enabled_metric_tables]: ", "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[epilogue_fusion]: True", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[epilogue_fusion_first]: False", "[lxxtoqhcoepwfokeiibd575gnxo3uzwiv4hmpomlwkpzqz3qzsh] inductor_config[estimate_op_runtime]: default", "[h25wqx6vliw4j5rtzzbv6latydxyei3deyg6v7wzvnzryfktuki] inductor_config[external_matmul]: []", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[fallback_random]: False", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[force_disable_caches]: False", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[force_fuse_int_mm_with_mul]: False", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[force_layout_optimization]: False", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[force_same_precision]: False", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[force_shape_pad]: False", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[freezing]: False", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[freezing_discard_parameters]: False", "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[fx_graph_cache]: True", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[fx_graph_remote_cache]: False", "[62lrdx35b7hnumwb7mp5oc5y5csm2abylvtdzfloct3noaqov3n] inductor_config[fx_passes_numeric_check]: {'pre_grad': False, 'post_grad': False, 'precision': 0.0001, 'num_iterations': 1, 'requires_optimizer': True}", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[generate_intermediate_hooks]: False", "[tquy2we2efmowuj4wuqzcfcfdcrkzkzmwdae6hprj7fa64jpusq] inductor_config[global_cache_dir]: None", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[group_fusion]: False", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[halide.asserts]: False", "[ljhgflgihidopsfsdcbqynv27nceykby3nutyd5jlcpq7n6e7l4] inductor_config[halide.cpu_target]: host", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[halide.debug]: False", "[wx7vmsmrdpk5ue2txlywp3lj3faqmdjphs5fgg2ehzsyno7uovg] inductor_config[halide.gpu_target]: host-cuda", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[halide.scan_kernels]: False", "[k5ogk6345jvklsnu7g2njqstiz2g6pm5wmqpgg3kasrmuqwjvl6] inductor_config[halide.scheduler_cpu]: Adams2019", "[svgytlua5wcyeia7wq7e6zgh5tsueikrnzchmdmouvmkpfsc2zq] inductor_config[halide.scheduler_cuda]: Anderson2021", "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[implicit_fallbacks]: True", "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[inplace_buffers]: True", "[5fxczt3ciyxitdhizb7sfsgn7fhpczcqsngttnt5ot2wyctk7co] inductor_config[inter_node_bw]: 25", "[yezuzjtg4h3jjur4jwtwiehbyixa7eonq4tqsqmwqve2lvvmrem] inductor_config[intra_node_bw]: 300", "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[is_nightly_or_source]: True", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[is_predispatch]: False", "[tquy2we2efmowuj4wuqzcfcfdcrkzkzmwdae6hprj7fa64jpusq] inductor_config[joint_custom_post_pass]: None", "[tquy2we2efmowuj4wuqzcfcfdcrkzkzmwdae6hprj7fa64jpusq] inductor_config[joint_custom_pre_pass]: None", "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[joint_graph_constant_folding]: True", "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[keep_output_stride]: True", "[j6c55jha5r2sdys2rwq7uqhtleea5dgjcye7nicfgft36v7xfvp] inductor_config[kernel_name_max_ops]: 10", "[4p2fdjlvxrcw7c7fvzm5huhtqxnro4kvkx56f7p5zyrxqkwooov] inductor_config[layout_opt_default]: 1", "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[layout_optimization]: True", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[loop_ordering_after_fusion]: False", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[max_autotune]: False", "[uqlsbif4zxd75vt522p52txyuguieipi2lwz5g5awt56lccqk7s] inductor_config[max_autotune_conv_backends]: ATEN,TRITON", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[max_autotune_gemm]: False", "[2y7luesktjrque3nr7qtxnum2mkbeegzdrsvkm3rvdlhqboajhx] inductor_config[max_autotune_gemm_backends]: ATEN,TRITON,CPP", "[jvchmi66fvqzlemhr5fcqorz5trfdtdalzfagtj2aolmimwqhdq] inductor_config[max_autotune_gemm_search_space]: DEFAULT", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[max_autotune_pointwise]: False", "[bh33ranllcgilhgmgr3qvygzxjm6isq5iexnfm3zx6fnr2zwlp2] inductor_config[max_autotune_subproc_graceful_timeout_seconds]: 1.0", "[iglov24t7x5ruci344aer2tm6nqshi4veuw4wxlssxtu46cx76m] inductor_config[max_autotune_subproc_result_timeout_seconds]: 60.0", "[pwoh5aypf4fxbntdvwt67rppxorqos6xr3w7qzeun6kblbfg2ga] inductor_config[max_autotune_subproc_terminate_timeout_seconds]: 2.0", "[aghvyrrgwvxijco2pk5wzc3cgmmthrbmgxitiibxuuscxdwrjd3] inductor_config[max_epilogue_benchmarked_choices]: 1", "[jykiys6ynafs3zdylwa5ggq6j655mxeh42d6mtdi22gffkrmiac] inductor_config[max_fusion_size]: 64", "[yttmfmxblgcbsvbokguzowcorrcxz5uunxtcvsbe6nijgcx45he] inductor_config[max_pointwise_cat_inputs]: 8", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[memory_planning]: False", "[x75won4jmsgeb63pcvwr2y4eteyzzdhmf5rv6xhjppie4hx2yu5] inductor_config[memory_pool]: intermediates", "[v2td5s4lnsvyxvaevy4chx6kc5h3mm2axazbgwimqule5zrzao7] inductor_config[mixed_mm_choice]: heuristic", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[nan_asserts]: False", "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[optimize_scatter_upon_const_tensor]: True", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[pad_channels_last]: False", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[pad_outputs]: False", "[ljdqgtysl3vdf7j6attlz5gmjg2ncihnveojfyubosplmkrjgra] inductor_config[padding_alignment_bytes]: 128", "[dnnw5ks3yxrp7mwvihb2hh4tqx35ye637xt33x64kw4fvz2nyzg] inductor_config[padding_stride_threshold]: 1024", "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[pattern_matcher]: True", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[permute_fusion]: False", "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[pick_loop_orders]: True", "[tquy2we2efmowuj4wuqzcfcfdcrkzkzmwdae6hprj7fa64jpusq] inductor_config[post_grad_custom_post_pass]: None", "[tquy2we2efmowuj4wuqzcfcfdcrkzkzmwdae6hprj7fa64jpusq] inductor_config[post_grad_custom_pre_pass]: None", "[4bryyl4ahh5whyg3zwqebpwmjnx6w77nqgqbdjlowju6lkqtn7w] inductor_config[post_grad_fusion_options]: {}", "[tquy2we2efmowuj4wuqzcfcfdcrkzkzmwdae6hprj7fa64jpusq] inductor_config[pre_grad_custom_pass]: None", "[4bryyl4ahh5whyg3zwqebpwmjnx6w77nqgqbdjlowju6lkqtn7w] inductor_config[pre_grad_fusion_options]: {}", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[profile_bandwidth]: False", "[tquy2we2efmowuj4wuqzcfcfdcrkzkzmwdae6hprj7fa64jpusq] inductor_config[profile_bandwidth_output]: None", "[v3hzzlv4tjgvp3pyhmzagjd25orl6n7nynoa7svlhhwk73b7u3c] inductor_config[profile_bandwidth_regex]: ", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[profile_bandwidth_with_do_bench_using_profiling]: False", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[profiler_mark_wrapper_call]: False", "[yttmfmxblgcbsvbokguzowcorrcxz5uunxtcvsbe6nijgcx45he] inductor_config[realize_acc_reads_threshold]: 8", "[rr5m5hsocoyodldz7vcvaizdwvm2rt34evmqdxvng7wz3tufvo6] inductor_config[realize_opcount_threshold]: 30", "[lkkae3meylaixfif4thncru4hjqeaislawjoghffrbwuscaagei] inductor_config[realize_reads_threshold]: 4", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[reorder_for_compute_comm_overlap]: False", "[ssupi7bu3rrhdpg2jyegzncu3kg3nnhklyliqvutaxgs7y7k3dx] inductor_config[reorder_for_compute_comm_overlap_passes]: ['reorder_compute_for_overlap', 'sink_waits', 'raise_comms']", "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[reorder_for_locality]: True", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[reorder_for_peak_memory]: False", "[h25wqx6vliw4j5rtzzbv6latydxyei3deyg6v7wzvnzryfktuki] inductor_config[rocm.arch]: []", "[tquy2we2efmowuj4wuqzcfcfdcrkzkzmwdae6hprj7fa64jpusq] inductor_config[rocm.ck_dir]: None", "[oartxnko2l7d67tzwwm2otcumaut3n4wwcfgz3o377hmcveu5ft] inductor_config[rocm.ck_supported_arch]: ['gfx90a', 'gfx940', 'gfx941', 'gfx942']", "[klfqjprnpfhcdurgvuikvc4rpd5ynkpk77toousr5h3u5roty6p] inductor_config[rocm.compile_opt_level]: -O2", "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[rocm.flush_denormals]: True", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[rocm.is_debug]: False", "[tquy2we2efmowuj4wuqzcfcfdcrkzkzmwdae6hprj7fa64jpusq] inductor_config[rocm.n_max_profiling_configs]: None", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[rocm.print_kernel_resource_usage]: False", "[tquy2we2efmowuj4wuqzcfcfdcrkzkzmwdae6hprj7fa64jpusq] inductor_config[rocm.rocm_home]: None", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[rocm.save_temps]: False", "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[rocm.use_fast_math]: True", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[rocm.use_preselected_instances]: False", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[save_args]: False", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[search_autotune_cache]: False", "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[shape_padding]: True", "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[size_asserts]: True", "[tquy2we2efmowuj4wuqzcfcfdcrkzkzmwdae6hprj7fa64jpusq] inductor_config[sleep_sec_TESTING_ONLY]: None", "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[split_cat_fx_passes]: True", "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[split_reductions]: True", "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[static_weight_shapes]: True", "[tquy2we2efmowuj4wuqzcfcfdcrkzkzmwdae6hprj7fa64jpusq] inductor_config[triton.autotune_at_compile_time]: None", "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[triton.autotune_cublasLt]: True", "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[triton.autotune_pointwise]: True", "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[triton.codegen_upcast_to_fp32]: True", "[tuax46wac7rfv2trf5gcps6vleo3cq44lbnrdxtprvo3ljjaddj] inductor_config[triton.cudagraph_dynamic_shape_warn_limit]: 50", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[triton.cudagraph_skip_dynamic_graphs]: False", "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[triton.cudagraph_support_input_mutation]: True", "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[triton.cudagraph_trees]: True", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[triton.cudagraph_trees_history_recording]: False", "[ljdqgtysl3vdf7j6attlz5gmjg2ncihnveojfyubosplmkrjgra] inductor_config[triton.cudagraph_unexpected_rerecord_limit]: 128", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[triton.cudagraphs]: False", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[triton.debug_sync_graph]: False", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[triton.debug_sync_kernel]: False", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[triton.dense_indexing]: False", "[yrty22bseefglnysuoec4ji7j2rnaggdj3g33zzj7avogwfmgdw] inductor_config[triton.descriptive_names]: original_aten", "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[triton.divisible_by_16]: True", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[triton.fast_path_cudagraph_asserts]: False", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[triton.force_cudagraph_sync]: False", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[triton.force_cudagraphs_warmup]: False", "[tquy2we2efmowuj4wuqzcfcfdcrkzkzmwdae6hprj7fa64jpusq] inductor_config[triton.inject_relu_bug_TESTING_ONLY]: None", "[pr5nr4a7dthirgd2ljo3d2xakc63ywxugusu6mkmr6gmpeliyib] inductor_config[triton.max_tiles]: 2", "[fv6slhtedtydps5s5u2etitscliblzcidyitqf7krsv4e23fzk6] inductor_config[triton.min_split_scan_rblock]: 256", "[vrl5ktomgtzox5xucd3np6vug3vyj6hwwzahqijuwpmamlv7ohi] inductor_config[triton.multi_kernel]: 0", "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[triton.persistent_reductions]: True", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[triton.prefer_nd_tiling]: False", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[triton.skip_cudagraph_warmup]: False", "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[triton.slow_path_cudagraph_asserts]: True", "[ebt2ncs4f5y7dn7btzi76mnouepvzad474tmp5iju4wiuumjl4s] inductor_config[triton.spill_threshold]: 16", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[triton.store_cubin]: False", "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[triton.tiling_prevents_pointwise_fusion]: True", "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[triton.tiling_prevents_reduction_fusion]: True", "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[triton.unique_kernel_names]: True", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[triton.use_block_ptr]: False", "[vzzema5ityqj2wepdmkulue7q5pcevdr5h27oxxutf35d4tjume] inductor_config[triton_kernel_default_layout_constraint]: flexible_layout", "[wft6ljqsfr3x4m7fa5zuyb7cwknky4irrxz4bjr6uzr2yiopxqj] inductor_config[unbacked_symint_fallback]: 8192", "[yttmfmxblgcbsvbokguzowcorrcxz5uunxtcvsbe6nijgcx45he] inductor_config[unroll_reductions_threshold]: 8", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[unsafe_ignore_unsupported_triton_autotune_args]: False", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[use_minimal_arrayref_interface]: False", "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[use_mixed_mm]: True", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[verbose_progress]: False", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[warn_mix_layout]: False"], "cache_event_time": 1727975456229537044, "cache_state": "miss", "time_taken_ns": 6372999079} +V1003 10:11:02.605000 2235078 torch/_dynamo/utils.py:988] {"chromium_event": {}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0, "has_payload": "20079c82f956785a2eeb02be390ae599"} + { + "name": "inductor_compile", + "ts": 1727975462605207.0, + "args": { + "cache_stats": { + "fxgraph_cache_hit": 0, + "fxgraph_cache_miss": 1, + "fxgraph_cache_bypass": 0 + } + }, + "ph": "E", + "cat": "dynamo_timed", + "tid": 0, + "pid": 0 + } +V1003 10:11:02.605000 2235078 torch/_dynamo/utils.py:988] {"chromium_event": {}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0, "has_payload": "194073b2ec42e1f2dbb4aff2a005e416"} + { + "name": "compile_fx_inner", + "ts": 1727975462605545.2, + "args": { + "cache_stats": { + "fxgraph_cache_hit": 0, + "fxgraph_cache_miss": 1, + "fxgraph_cache_bypass": 0 + } + }, + "ph": "E", + "cat": "dynamo_timed", + "tid": 0, + "pid": 0 + } +V1003 10:11:02.606000 2235078 torch/_dynamo/utils.py:988] {"chromium_event": {}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0, "has_payload": "ae70f86dc28b7efeea5cae51d87a7327"} + { + "name": "compile_fx..fw_compiler_base", + "ts": 1727975462606301.8, + "args": { + "cache_stats": { + "fxgraph_cache_hit": 0, + "fxgraph_cache_miss": 1, + "fxgraph_cache_bypass": 0 + } + }, + "ph": "E", + "cat": "dynamo_timed", + "tid": 0, + "pid": 0 + } +V1003 10:11:02.609000 2235078 torch/_dynamo/utils.py:988] {"chromium_event": {}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0, "has_payload": "7d733902132e302c6ed3910c31586fe0"} + { + "name": "create_aot_dispatcher_function", + "ts": 1727975462609845.0, + "args": { + "cache_stats": { + "fxgraph_cache_hit": 0, + "fxgraph_cache_miss": 1, + "fxgraph_cache_bypass": 0 + } + }, + "ph": "E", + "cat": "dynamo_timed", + "tid": 0, + "pid": 0 + } +V1003 10:11:02.610000 2235078 torch/_dynamo/utils.py:988] {"chromium_event": {}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0, "has_payload": "c2cdcd6e0e2a687934149efb13d13ca8"} + { + "name": "backend_compile", + "ts": 1727975462610479.8, + "args": { + "cache_stats": { + "fxgraph_cache_hit": 0, + "fxgraph_cache_miss": 1, + "fxgraph_cache_bypass": 0 + } + }, + "ph": "E", + "cat": "dynamo_timed", + "tid": 0, + "pid": 0 + } +V1003 10:11:02.610000 2235078 torch/_dynamo/utils.py:988] {"chromium_event": {}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0, "has_payload": "d20e4a17a90e3e33d30bd927bed546fc"} + { + "name": "OutputGraph.call_user_compiler", + "ts": 1727975462610762.5, + "args": { + "cache_stats": { + "fxgraph_cache_hit": 0, + "fxgraph_cache_miss": 1, + "fxgraph_cache_bypass": 0 + } + }, + "ph": "E", + "cat": "dynamo_timed", + "tid": 0, + "pid": 0 + } +V1003 10:11:02.642000 2235078 torch/_dynamo/guards.py:2311] {"dynamo_cpp_guards_str": {}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0, "has_payload": "35a49d1f2da1768efdba707d805b4b97"} + + TREE_GUARD_MANAGER: + +- RootGuardManager + | +- DEFAULT_DEVICE: utils_device.CURRENT_DEVICE == None # _dynamo/output_graph.py:471 in init_ambient_guards + | +- GLOBAL_STATE: ___check_global_state() + | +- TORCH_FUNCTION_MODE_STACK: ___check_torch_function_mode_stack() + | +- GuardManager: source=L['k'], accessed_by=DictGetItemGuardAccessor(k) + | | +- TYPE_MATCH: ___check_type_id(L['k'], 82028112) + | | +- TENSOR_MATCH: check_tensor(L['k'], Tensor, DispatchKeySet(CUDA, BackendSelect, ADInplaceOrView, AutogradCUDA), torch.float32, device=0, requires_grad=False, size=[1, 4, 512, 64], stride=[131072, 32768, 64, 1]) + | | +- NO_HASATTR: hasattr(L['k'], '_dynamo_dynamic_indices') == False + | | +- NO_TENSOR_ALIASING: check_no_aliasing(L['k'], L['q'], L['v'], L['block_mask'].q_indices, L['block_mask'].kv_indices, L['block_mask'].q_num_blocks, L['block_mask'].kv_num_blocks, L['block_mask'].full_q_indices, L['block_mask'].full_kv_indices, L['block_mask'].full_q_num_blocks, L['block_mask'].full_kv_num_blocks) + | +- GuardManager: source=L['q'], accessed_by=DictGetItemGuardAccessor(q) + | | +- TYPE_MATCH: ___check_type_id(L['q'], 82028112) + | | +- TENSOR_MATCH: check_tensor(L['q'], Tensor, DispatchKeySet(CUDA, BackendSelect, ADInplaceOrView, AutogradCUDA), torch.float32, device=0, requires_grad=False, size=[1, 4, 512, 64], stride=[131072, 32768, 64, 1]) + | | +- NO_HASATTR: hasattr(L['q'], '_dynamo_dynamic_indices') == False + | | +- NO_TENSOR_ALIASING + | +- GuardManager: source=L['v'], accessed_by=DictGetItemGuardAccessor(v) + | | +- TYPE_MATCH: ___check_type_id(L['v'], 82028112) + | | +- TENSOR_MATCH: check_tensor(L['v'], Tensor, DispatchKeySet(CUDA, BackendSelect, ADInplaceOrView, AutogradCUDA), torch.float32, device=0, requires_grad=False, size=[1, 4, 512, 64], stride=[131072, 32768, 64, 1]) + | | +- NO_HASATTR: hasattr(L['v'], '_dynamo_dynamic_indices') == False + | | +- NO_TENSOR_ALIASING + | +- GuardManager: source=L['score_mod'], accessed_by=DictGetItemGuardAccessor(score_mod) + | | +- GuardManager: source=L['score_mod'].__code__, accessed_by=GetAttrGuardAccessor(__code__) + | | | +- ID_MATCH: ___check_obj_id(L['score_mod'].__code__, 140413271879296) + | +- GuardManager: source=L['block_mask'], accessed_by=DictGetItemGuardAccessor(block_mask) + | | +- TYPE_MATCH: ___check_type_id(L['block_mask'], 387600320) + | | +- GuardManager: source=L['block_mask'].mask_mod, accessed_by=GetAttrGuardAccessor(mask_mod) + | | | +- GuardManager: source=L['block_mask'].mask_mod.__code__, accessed_by=GetAttrGuardAccessor(__code__) + | | | | +- ID_MATCH: ___check_obj_id(L['block_mask'].mask_mod.__code__, 140413271880128) + | | +- GuardManager: source=L['block_mask'].q_indices, accessed_by=GetAttrGuardAccessor(q_indices) + | | | +- TENSOR_MATCH: check_tensor(L['block_mask'].q_indices, Tensor, DispatchKeySet(CUDA, BackendSelect, ADInplaceOrView, AutogradCUDA), torch.int32, device=0, requires_grad=False, size=[1, 1, 16, 16], stride=[256, 256, 16, 1]) + | | | +- NO_HASATTR: hasattr(L['block_mask'].q_indices, '_dynamo_dynamic_indices') == False + | | | +- NO_TENSOR_ALIASING + | | +- GuardManager: source=L['block_mask'].BLOCK_SIZE, accessed_by=GetAttrGuardAccessor(BLOCK_SIZE) + | | | +- TYPE_MATCH: ___check_type_id(L['block_mask'].BLOCK_SIZE, 8815232) + | | | +- LENGTH_CHECK: len(L['block_mask'].BLOCK_SIZE) == 2 + | | | +- GuardManager: source=L['block_mask'].BLOCK_SIZE[0], accessed_by=TupleGetItemGuardAccessor(0) + | | | | +- EQUALS_MATCH: L['block_mask'].BLOCK_SIZE[0] == 128 + | | | +- GuardManager: source=L['block_mask'].BLOCK_SIZE[1], accessed_by=TupleGetItemGuardAccessor(1) + | | | | +- EQUALS_MATCH: L['block_mask'].BLOCK_SIZE[1] == 128 + | | +- GuardManager: source=L['block_mask'].kv_indices, accessed_by=GetAttrGuardAccessor(kv_indices) + | | | +- TENSOR_MATCH: check_tensor(L['block_mask'].kv_indices, Tensor, DispatchKeySet(CUDA, BackendSelect, ADInplaceOrView, AutogradCUDA), torch.int32, device=0, requires_grad=False, size=[1, 1, 16, 16], stride=[256, 256, 16, 1]) + | | | +- NO_HASATTR: hasattr(L['block_mask'].kv_indices, '_dynamo_dynamic_indices') == False + | | | +- NO_TENSOR_ALIASING + | | +- GuardManager: source=L['block_mask'].q_num_blocks, accessed_by=GetAttrGuardAccessor(q_num_blocks) + | | | +- TENSOR_MATCH: check_tensor(L['block_mask'].q_num_blocks, Tensor, DispatchKeySet(CUDA, BackendSelect, ADInplaceOrView, AutogradCUDA), torch.int32, device=0, requires_grad=False, size=[1, 1, 16], stride=[16, 16, 1]) + | | | +- NO_HASATTR: hasattr(L['block_mask'].q_num_blocks, '_dynamo_dynamic_indices') == False + | | | +- NO_TENSOR_ALIASING + | | +- GuardManager: source=L['block_mask'].kv_num_blocks, accessed_by=GetAttrGuardAccessor(kv_num_blocks) + | | | +- TENSOR_MATCH: check_tensor(L['block_mask'].kv_num_blocks, Tensor, DispatchKeySet(CUDA, BackendSelect, ADInplaceOrView, AutogradCUDA), torch.int32, device=0, requires_grad=False, size=[1, 1, 16], stride=[16, 16, 1]) + | | | +- NO_HASATTR: hasattr(L['block_mask'].kv_num_blocks, '_dynamo_dynamic_indices') == False + | | | +- NO_TENSOR_ALIASING + | | +- GuardManager: source=L['block_mask'].full_q_indices, accessed_by=GetAttrGuardAccessor(full_q_indices) + | | | +- TENSOR_MATCH: check_tensor(L['block_mask'].full_q_indices, Tensor, DispatchKeySet(CUDA, BackendSelect, ADInplaceOrView, AutogradCUDA), torch.int32, device=0, requires_grad=False, size=[1, 1, 16, 16], stride=[256, 256, 16, 1]) + | | | +- NO_HASATTR: hasattr(L['block_mask'].full_q_indices, '_dynamo_dynamic_indices') == False + | | | +- NO_TENSOR_ALIASING + | | +- GuardManager: source=L['block_mask'].full_kv_indices, accessed_by=GetAttrGuardAccessor(full_kv_indices) + | | | +- TENSOR_MATCH: check_tensor(L['block_mask'].full_kv_indices, Tensor, DispatchKeySet(CUDA, BackendSelect, ADInplaceOrView, AutogradCUDA), torch.int32, device=0, requires_grad=False, size=[1, 1, 16, 16], stride=[256, 256, 16, 1]) + | | | +- NO_HASATTR: hasattr(L['block_mask'].full_kv_indices, '_dynamo_dynamic_indices') == False + | | | +- NO_TENSOR_ALIASING + | | +- GuardManager: source=L['block_mask'].full_q_num_blocks, accessed_by=GetAttrGuardAccessor(full_q_num_blocks) + | | | +- TENSOR_MATCH: check_tensor(L['block_mask'].full_q_num_blocks, Tensor, DispatchKeySet(CUDA, BackendSelect, ADInplaceOrView, AutogradCUDA), torch.int32, device=0, requires_grad=False, size=[1, 1, 16], stride=[16, 16, 1]) + | | | +- NO_HASATTR: hasattr(L['block_mask'].full_q_num_blocks, '_dynamo_dynamic_indices') == False + | | | +- NO_TENSOR_ALIASING + | | +- GuardManager: source=L['block_mask'].full_kv_num_blocks, accessed_by=GetAttrGuardAccessor(full_kv_num_blocks) + | | | +- TENSOR_MATCH: check_tensor(L['block_mask'].full_kv_num_blocks, Tensor, DispatchKeySet(CUDA, BackendSelect, ADInplaceOrView, AutogradCUDA), torch.int32, device=0, requires_grad=False, size=[1, 1, 16], stride=[16, 16, 1]) + | | | +- NO_HASATTR: hasattr(L['block_mask'].full_kv_num_blocks, '_dynamo_dynamic_indices') == False + | | | +- NO_TENSOR_ALIASING + | | +- GuardManager: source=L['block_mask'].as_tuple, accessed_by=GetAttrGuardAccessor(as_tuple) + | | | +- GuardManager: source=L['block_mask'].as_tuple, accessed_by=FuncDefaultsGuardAccessor + | | | | +- GuardManager: source=L['block_mask'].as_tuple.__defaults__[0], accessed_by=GetItemGuardAccessor(0) + | | | | | +- ID_MATCH: ___check_obj_id(L['block_mask'].as_tuple.__defaults__[0], 8911040) + | +- GuardManager: source=L['flex_attention'], accessed_by=DictGetItemGuardAccessor(flex_attention) + | | +- GuardManager: source=L['flex_attention'].__code__, accessed_by=GetAttrGuardAccessor(__code__) + | | | +- ID_MATCH: ___check_obj_id(L['flex_attention'].__code__, 387082992) + | | +- GuardManager: source=L['flex_attention'], accessed_by=FuncDefaultsGuardAccessor + | | | +- GuardManager: source=L['flex_attention'].__defaults__[2], accessed_by=GetItemGuardAccessor(2) + | | | | +- ID_MATCH: ___check_obj_id(L['flex_attention'].__defaults__[2], 8825760) + | | | +- GuardManager: source=L['flex_attention'].__defaults__[3], accessed_by=GetItemGuardAccessor(3) + | | | | +- ID_MATCH: ___check_obj_id(L['flex_attention'].__defaults__[3], 8910592) + | | | +- GuardManager: source=L['flex_attention'].__defaults__[4], accessed_by=GetItemGuardAccessor(4) + | | | | +- ID_MATCH: ___check_obj_id(L['flex_attention'].__defaults__[4], 8910592) + | | | +- GuardManager: source=L['flex_attention'].__defaults__[5], accessed_by=GetItemGuardAccessor(5) + | | | | +- ID_MATCH: ___check_obj_id(L['flex_attention'].__defaults__[5], 8825760) + | +- GuardManager: source=G, accessed_by=GlobalsGuardAccessor + | | +- GuardManager: source=G['__builtins_dict___2'], accessed_by=DictGetItemGuardAccessor(__builtins_dict___2) + | | | +- GuardManager: source=G['__builtins_dict___2']['len'], accessed_by=DictGetItemGuardAccessor(len) + | | | | +- ID_MATCH: ___check_obj_id(G['__builtins_dict___2']['len'], 140413275558816) + | | | +- GuardManager: source=G['__builtins_dict___2']['sum'], accessed_by=DictGetItemGuardAccessor(sum) + | | | | +- ID_MATCH: ___check_obj_id(G['__builtins_dict___2']['sum'], 140413275559936) + | | | +- GuardManager: source=G['__builtins_dict___2']['list'], accessed_by=DictGetItemGuardAccessor(list) + | | | | +- ID_MATCH: ___check_obj_id(G['__builtins_dict___2']['list'], 8844320) + | | | +- GuardManager: source=G['__builtins_dict___2']['type'], accessed_by=DictGetItemGuardAccessor(type) + | | | | +- ID_MATCH: ___check_obj_id(G['__builtins_dict___2']['type'], 8813248) + | | | +- GuardManager: source=G['__builtins_dict___2']['tuple'], accessed_by=DictGetItemGuardAccessor(tuple) + | | | | +- ID_MATCH: ___check_obj_id(G['__builtins_dict___2']['tuple'], 8815232) + | | | +- GuardManager: source=G['__builtins_dict___2']['object'], accessed_by=DictGetItemGuardAccessor(object) + | | | | +- ID_MATCH: ___check_obj_id(G['__builtins_dict___2']['object'], 8813984) + | | | +- GuardManager: source=G['__builtins_dict___2']['isinstance'], accessed_by=DictGetItemGuardAccessor(isinstance) + | | | | +- ID_MATCH: ___check_obj_id(G['__builtins_dict___2']['isinstance'], 140413275558496) + | | +- GuardManager: source=G['__import_torch_dot_utils_dot__pytree'], accessed_by=DictGetItemGuardAccessor(__import_torch_dot_utils_dot__pytree) + | | | +- ID_MATCH: ___check_obj_id(G['__import_torch_dot_utils_dot__pytree'], 140411217627952) + | | | +- GuardManager: source=G['__import_torch_dot_utils_dot__pytree'].TreeSpec, accessed_by=GetAttrGuardAccessor(TreeSpec) + | | | | +- ID_MATCH: ___check_obj_id(G['__import_torch_dot_utils_dot__pytree'].TreeSpec, 84866496) + | | | +- GuardManager: source=G['__import_torch_dot_utils_dot__pytree']._is_leaf, accessed_by=GetAttrGuardAccessor(_is_leaf) + | | | | +- GuardManager: source=G['__import_torch_dot_utils_dot__pytree']._is_leaf.__code__, accessed_by=GetAttrGuardAccessor(__code__) + | | | | | +- ID_MATCH: ___check_obj_id(G['__import_torch_dot_utils_dot__pytree']._is_leaf.__code__, 140411217262720) + | | | +- GuardManager: source=G['__import_torch_dot_utils_dot__pytree']._LEAF_SPEC, accessed_by=GetAttrGuardAccessor(_LEAF_SPEC) + | | | | +- TYPE_MATCH: ___check_type_id(G['__import_torch_dot_utils_dot__pytree']._LEAF_SPEC, 85171104) + | | | | +- GuardManager: source=G['__import_torch_dot_utils_dot__pytree']._LEAF_SPEC.type, accessed_by=GetAttrGuardAccessor(type) + | | | | | +- ID_MATCH: ___check_obj_id(G['__import_torch_dot_utils_dot__pytree']._LEAF_SPEC.type, 8825760) + | | | | +- GuardManager: source=G['__import_torch_dot_utils_dot__pytree']._LEAF_SPEC.context, accessed_by=GetAttrGuardAccessor(context) + | | | | | +- ID_MATCH: ___check_obj_id(G['__import_torch_dot_utils_dot__pytree']._LEAF_SPEC.context, 8825760) + | | | | +- GuardManager: source=G['__import_torch_dot_utils_dot__pytree']._LEAF_SPEC.num_nodes, accessed_by=GetAttrGuardAccessor(num_nodes) + | | | | | +- EQUALS_MATCH: G['__import_torch_dot_utils_dot__pytree']._LEAF_SPEC.num_nodes == 1 + | | | | +- GuardManager: source=G['__import_torch_dot_utils_dot__pytree']._LEAF_SPEC.num_leaves, accessed_by=GetAttrGuardAccessor(num_leaves) + | | | | | +- EQUALS_MATCH: G['__import_torch_dot_utils_dot__pytree']._LEAF_SPEC.num_leaves == 1 + | | | | +- GuardManager: source=G['__import_torch_dot_utils_dot__pytree']._LEAF_SPEC.num_children, accessed_by=GetAttrGuardAccessor(num_children) + | | | | | +- EQUALS_MATCH: G['__import_torch_dot_utils_dot__pytree']._LEAF_SPEC.num_children == 0 + | | | | +- GuardManager: source=G['__import_torch_dot_utils_dot__pytree']._LEAF_SPEC.children_specs, accessed_by=GetAttrGuardAccessor(children_specs) + | | | | | +- TYPE_MATCH: ___check_type_id(G['__import_torch_dot_utils_dot__pytree']._LEAF_SPEC.children_specs, 8844320) + | | | | | +- LENGTH_CHECK: not G['__import_torch_dot_utils_dot__pytree']._LEAF_SPEC.children_specs + | | | +- GuardManager: source=G['__import_torch_dot_utils_dot__pytree']._get_node_type, accessed_by=GetAttrGuardAccessor(_get_node_type) + | | | | +- GuardManager: source=G['__import_torch_dot_utils_dot__pytree']._get_node_type.__code__, accessed_by=GetAttrGuardAccessor(__code__) + | | | | | +- ID_MATCH: ___check_obj_id(G['__import_torch_dot_utils_dot__pytree']._get_node_type.__code__, 140411217262448) + | | | +- DictGuardManager: source=G['__import_torch_dot_utils_dot__pytree'].SUPPORTED_NODES, accessed_by=GetAttrGuardAccessor(SUPPORTED_NODES) + | | | | +- DICT_VERSION: ___dict_version(G['__import_torch_dot_utils_dot__pytree'].SUPPORTED_NODES) == 519596 + | | | | +- KeyValueManager pair at index=1 + | | | | | +- ValueManager: GuardManager: source=G['__import_torch_dot_utils_dot__pytree'].SUPPORTED_NODES[list(G['__import_torch_dot_utils_dot__pytree'].SUPPORTED_NODES.keys())[1]] + | | | | | | +- GuardManager: source=G['__import_torch_dot_utils_dot__pytree'].SUPPORTED_NODES[list(G['__import_torch_dot_utils_dot__pytree'].SUPPORTED_NODES.keys())[1]].flatten_fn, accessed_by=GetAttrGuardAccessor(flatten_fn) + | | | | | | | +- GuardManager: source=G['__import_torch_dot_utils_dot__pytree'].SUPPORTED_NODES[list(G['__import_torch_dot_utils_dot__pytree'].SUPPORTED_NODES.keys())[1]].flatten_fn.__code__, accessed_by=GetAttrGuardAccessor(__code__) + | | | | | | | | +- ID_MATCH: ___check_obj_id(G['__import_torch_dot_utils_dot__pytree'].SUPPORTED_NODES[list(G['__import_torch_dot_utils_dot__pytree'].SUPPORTED_NODES.keys())[1]].flatten_fn.__code__, 140411196281984) + | | | | | | +- GuardManager: source=G['__import_torch_dot_utils_dot__pytree'].SUPPORTED_NODES[list(G['__import_torch_dot_utils_dot__pytree'].SUPPORTED_NODES.keys())[1]].unflatten_fn, accessed_by=GetAttrGuardAccessor(unflatten_fn) + | | | | | | | +- GuardManager: source=G['__import_torch_dot_utils_dot__pytree'].SUPPORTED_NODES[list(G['__import_torch_dot_utils_dot__pytree'].SUPPORTED_NODES.keys())[1]].unflatten_fn.__code__, accessed_by=GetAttrGuardAccessor(__code__) + | | | | | | | | +- ID_MATCH: ___check_obj_id(G['__import_torch_dot_utils_dot__pytree'].SUPPORTED_NODES[list(G['__import_torch_dot_utils_dot__pytree'].SUPPORTED_NODES.keys())[1]].unflatten_fn.__code__, 140411217182288) + | | | +- GuardManager: source=G['__import_torch_dot_utils_dot__pytree']._tree_flatten_helper, accessed_by=GetAttrGuardAccessor(_tree_flatten_helper) + | | | | +- GuardManager: source=G['__import_torch_dot_utils_dot__pytree']._tree_flatten_helper.__code__, accessed_by=GetAttrGuardAccessor(__code__) + | | | | | +- ID_MATCH: ___check_obj_id(G['__import_torch_dot_utils_dot__pytree']._tree_flatten_helper.__code__, 140411217413040) + | | | +- GuardManager: source=G['__import_torch_dot_utils_dot__pytree']._is_namedtuple_instance, accessed_by=GetAttrGuardAccessor(_is_namedtuple_instance) + | | | | +- GuardManager: source=G['__import_torch_dot_utils_dot__pytree']._is_namedtuple_instance.__code__, accessed_by=GetAttrGuardAccessor(__code__) + | | | | | +- ID_MATCH: ___check_obj_id(G['__import_torch_dot_utils_dot__pytree']._is_namedtuple_instance.__code__, 140411217412592) + | | +- GuardManager: source=G['__import_torch_dot__dynamo_dot_comptime'], accessed_by=DictGetItemGuardAccessor(__import_torch_dot__dynamo_dot_comptime) + | | | +- ID_MATCH: ___check_obj_id(G['__import_torch_dot__dynamo_dot_comptime'], 140410226912176) + | | +- GuardManager: source=G['__import_torch_dot__dynamo_dot_decorators'], accessed_by=DictGetItemGuardAccessor(__import_torch_dot__dynamo_dot_decorators) + | | | +- ID_MATCH: ___check_obj_id(G['__import_torch_dot__dynamo_dot_decorators'], 140410226910096) + | | | +- GuardManager: source=G['__import_torch_dot__dynamo_dot_decorators'].is_compiling, accessed_by=GetAttrGuardAccessor(is_compiling) + | | | | +- ID_MATCH: ___check_obj_id(G['__import_torch_dot__dynamo_dot_decorators'].is_compiling, 140410376252096) + | | +- GuardManager: source=G['__import_torch_dot_nn_dot_attention_dot__utils'], accessed_by=DictGetItemGuardAccessor(__import_torch_dot_nn_dot_attention_dot__utils) + | | | +- ID_MATCH: ___check_obj_id(G['__import_torch_dot_nn_dot_attention_dot__utils'], 140409673896784) + | | | +- GuardManager: source=G['__import_torch_dot_nn_dot_attention_dot__utils']._SUPPORTED_HEAD_DIMS, accessed_by=GetAttrGuardAccessor(_SUPPORTED_HEAD_DIMS) + | | | | +- TYPE_MATCH: ___check_type_id(G['__import_torch_dot_nn_dot_attention_dot__utils']._SUPPORTED_HEAD_DIMS, 8844320) + | | | | +- LENGTH_CHECK: len(G['__import_torch_dot_nn_dot_attention_dot__utils']._SUPPORTED_HEAD_DIMS) == 10 + | | | | +- GuardManager: source=G['__import_torch_dot_nn_dot_attention_dot__utils']._SUPPORTED_HEAD_DIMS[0], accessed_by=ListGetItemGuardAccessor(0) + | | | | | +- EQUALS_MATCH: G['__import_torch_dot_nn_dot_attention_dot__utils']._SUPPORTED_HEAD_DIMS[0] == 2 + | | | | +- GuardManager: source=G['__import_torch_dot_nn_dot_attention_dot__utils']._SUPPORTED_HEAD_DIMS[1], accessed_by=ListGetItemGuardAccessor(1) + | | | | | +- EQUALS_MATCH: G['__import_torch_dot_nn_dot_attention_dot__utils']._SUPPORTED_HEAD_DIMS[1] == 4 + | | | | +- GuardManager: source=G['__import_torch_dot_nn_dot_attention_dot__utils']._SUPPORTED_HEAD_DIMS[2], accessed_by=ListGetItemGuardAccessor(2) + | | | | | +- EQUALS_MATCH: G['__import_torch_dot_nn_dot_attention_dot__utils']._SUPPORTED_HEAD_DIMS[2] == 8 + | | | | +- GuardManager: source=G['__import_torch_dot_nn_dot_attention_dot__utils']._SUPPORTED_HEAD_DIMS[3], accessed_by=ListGetItemGuardAccessor(3) + | | | | | +- EQUALS_MATCH: G['__import_torch_dot_nn_dot_attention_dot__utils']._SUPPORTED_HEAD_DIMS[3] == 16 + | | | | +- GuardManager: source=G['__import_torch_dot_nn_dot_attention_dot__utils']._SUPPORTED_HEAD_DIMS[4], accessed_by=ListGetItemGuardAccessor(4) + | | | | | +- EQUALS_MATCH: G['__import_torch_dot_nn_dot_attention_dot__utils']._SUPPORTED_HEAD_DIMS[4] == 32 + | | | | +- GuardManager: source=G['__import_torch_dot_nn_dot_attention_dot__utils']._SUPPORTED_HEAD_DIMS[5], accessed_by=ListGetItemGuardAccessor(5) + | | | | | +- EQUALS_MATCH: G['__import_torch_dot_nn_dot_attention_dot__utils']._SUPPORTED_HEAD_DIMS[5] == 64 + | | +- GuardManager: source=G['__import_torch_dot_nn_dot_attention_dot_flex_attention'], accessed_by=DictGetItemGuardAccessor(__import_torch_dot_nn_dot_attention_dot_flex_attention) + | | | +- ID_MATCH: ___check_obj_id(G['__import_torch_dot_nn_dot_attention_dot_flex_attention'], 140409673895824) + | | | +- GuardManager: source=G['__import_torch_dot_nn_dot_attention_dot_flex_attention'].math, accessed_by=GetAttrGuardAccessor(math) + | | | | +- ID_MATCH: ___check_obj_id(G['__import_torch_dot_nn_dot_attention_dot_flex_attention'].math, 140413266939392) + | | | | +- GuardManager: source=G['__import_torch_dot_nn_dot_attention_dot_flex_attention'].math.sqrt, accessed_by=GetAttrGuardAccessor(sqrt) + | | | | | +- ID_MATCH: ___check_obj_id(G['__import_torch_dot_nn_dot_attention_dot_flex_attention'].math.sqrt, 140413266943072) + | | | +- GuardManager: source=G['__import_torch_dot_nn_dot_attention_dot_flex_attention'].torch, accessed_by=GetAttrGuardAccessor(torch) + | | | | +- ID_MATCH: ___check_obj_id(G['__import_torch_dot_nn_dot_attention_dot_flex_attention'].torch, 140413267918368) + | | | | +- GuardManager: source=G['__import_torch_dot_nn_dot_attention_dot_flex_attention'].torch._dynamo, accessed_by=GetAttrGuardAccessor(_dynamo) + | | | | | +- ID_MATCH: ___check_obj_id(G['__import_torch_dot_nn_dot_attention_dot_flex_attention'].torch._dynamo, 140413260098400) + | | | | | +- GuardManager: source=G['__import_torch_dot_nn_dot_attention_dot_flex_attention'].torch._dynamo.mark_static, accessed_by=GetAttrGuardAccessor(mark_static) + | | | | | | +- GuardManager: source=G['__import_torch_dot_nn_dot_attention_dot_flex_attention'].torch._dynamo.mark_static.__code__, accessed_by=GetAttrGuardAccessor(__code__) + | | | | | | | +- ID_MATCH: ___check_obj_id(G['__import_torch_dot_nn_dot_attention_dot_flex_attention'].torch._dynamo.mark_static.__code__, 123166432) + | | | | +- GuardManager: source=G['__import_torch_dot_nn_dot_attention_dot_flex_attention'].torch.compiler, accessed_by=GetAttrGuardAccessor(compiler) + | | | | | +- ID_MATCH: ___check_obj_id(G['__import_torch_dot_nn_dot_attention_dot_flex_attention'].torch.compiler, 140410826010400) + | | | | | +- GuardManager: source=G['__import_torch_dot_nn_dot_attention_dot_flex_attention'].torch.compiler.is_dynamo_compiling, accessed_by=GetAttrGuardAccessor(is_dynamo_compiling) + | | | | | | +- ID_MATCH: ___check_obj_id(G['__import_torch_dot_nn_dot_attention_dot_flex_attention'].torch.compiler.is_dynamo_compiling, 140410826132992) + | | | +- GuardManager: source=G['__import_torch_dot_nn_dot_attention_dot_flex_attention']._validate_device, accessed_by=GetAttrGuardAccessor(_validate_device) + | | | | +- GuardManager: source=G['__import_torch_dot_nn_dot_attention_dot_flex_attention']._validate_device.__code__, accessed_by=GetAttrGuardAccessor(__code__) + | | | | | +- ID_MATCH: ___check_obj_id(G['__import_torch_dot_nn_dot_attention_dot_flex_attention']._validate_device.__code__, 140409673611088) + | | | +- GuardManager: source=G['__import_torch_dot_nn_dot_attention_dot_flex_attention'].flex_attention_hop, accessed_by=GetAttrGuardAccessor(flex_attention_hop) + | | | | +- TYPE_MATCH: ___check_type_id(G['__import_torch_dot_nn_dot_attention_dot_flex_attention'].flex_attention_hop, 96992544) + | | | | +- GuardManager: source=G['__import_torch_dot_nn_dot_attention_dot_flex_attention'].flex_attention_hop.__name__, accessed_by=GetAttrGuardAccessor(__name__) + | | | | | +- EQUALS_MATCH: G['__import_torch_dot_nn_dot_attention_dot_flex_attention'].flex_attention_hop.__name__ == 'flex_attention' + | | | +- GuardManager: source=G['__import_torch_dot_nn_dot_attention_dot_flex_attention']._supported_head_dim, accessed_by=GetAttrGuardAccessor(_supported_head_dim) + | | | | +- GuardManager: source=G['__import_torch_dot_nn_dot_attention_dot_flex_attention']._supported_head_dim.__code__, accessed_by=GetAttrGuardAccessor(__code__) + | | | | | +- ID_MATCH: ___check_obj_id(G['__import_torch_dot_nn_dot_attention_dot_flex_attention']._supported_head_dim.__code__, 140409673231376) + | | | +- GuardManager: source=G['__import_torch_dot_nn_dot_attention_dot_flex_attention']._validate_embed_dim, accessed_by=GetAttrGuardAccessor(_validate_embed_dim) + | | | | +- GuardManager: source=G['__import_torch_dot_nn_dot_attention_dot_flex_attention']._validate_embed_dim.__code__, accessed_by=GetAttrGuardAccessor(__code__) + | | | | | +- ID_MATCH: ___check_obj_id(G['__import_torch_dot_nn_dot_attention_dot_flex_attention']._validate_embed_dim.__code__, 388086512) + | | | +- GuardManager: source=G['__import_torch_dot_nn_dot_attention_dot_flex_attention']._validate_sdpa_input, accessed_by=GetAttrGuardAccessor(_validate_sdpa_input) + | | | | +- GuardManager: source=G['__import_torch_dot_nn_dot_attention_dot_flex_attention']._validate_sdpa_input.__code__, accessed_by=GetAttrGuardAccessor(__code__) + | | | | | +- ID_MATCH: ___check_obj_id(G['__import_torch_dot_nn_dot_attention_dot_flex_attention']._validate_sdpa_input.__code__, 387915104) + | | | +- GuardManager: source=G['__import_torch_dot_nn_dot_attention_dot_flex_attention']._apply_kernel_options, accessed_by=GetAttrGuardAccessor(_apply_kernel_options) + | | | | +- GuardManager: source=G['__import_torch_dot_nn_dot_attention_dot_flex_attention']._apply_kernel_options.__code__, accessed_by=GetAttrGuardAccessor(__code__) + | | | | | +- ID_MATCH: ___check_obj_id(G['__import_torch_dot_nn_dot_attention_dot_flex_attention']._apply_kernel_options.__code__, 140409683680752) + +V1003 10:11:02.643000 2235078 torch/_dynamo/utils.py:988] {"chromium_event": {}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0, "has_payload": "3937c61e6c98ab6b52d8e00fc2327f9d"} + { + "name": "entire_frame_compile", + "ts": 1727975462642948.2, + "args": { + "cache_stats": { + "fxgraph_cache_hit": 0, + "fxgraph_cache_miss": 1, + "fxgraph_cache_bypass": 0 + } + }, + "ph": "E", + "cat": "dynamo_timed", + "tid": 0, + "pid": 0 + } +V1003 10:11:02.643000 2235078 torch/_dynamo/utils.py:988] {"chromium_event": {}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0, "has_payload": "af7756663bbea703738c5d774f3ca273"} + { + "name": "_compile.compile_inner", + "ts": 1727975462643261.5, + "args": { + "cache_stats": { + "fxgraph_cache_hit": 0, + "fxgraph_cache_miss": 1, + "fxgraph_cache_bypass": 0 + } + }, + "ph": "E", + "cat": "dynamo_timed", + "tid": 0, + "pid": 0 + } +V1003 10:11:02.643000 2235078 torch/_dynamo/utils.py:840] {"compilation_metrics": {"compile_id": "1/0", "frame_key": "2", "co_name": "fn", "co_filename": "/data/users/oulgen/pytorch/test/inductor/test_codecache.py", "co_firstlineno": 379, "cache_size": 0, "accumulated_cache_size": 0, "guard_count": 80, "shape_env_guard_count": 0, "graph_op_count": 11, "graph_node_count": 25, "graph_input_count": 11, "start_time": 1727975453.4995801, "entire_frame_compile_time_s": 9.143243551254272, "backend_compile_time_s": 8.912311315536499, "inductor_compile_time_s": 8.53272271156311, "code_gen_time_s": 5.749065160751343, "fail_type": null, "fail_reason": null, "fail_user_frame_filename": null, "fail_user_frame_lineno": null, "non_compliant_ops": [], "compliant_custom_ops": [], "restart_reasons": [], "dynamo_time_before_restart_s": 0.0, "has_guarded_code": true, "possibly_missed_reinplacing_opportunities": 0, "remote_cache_time_saved_s": 0, "structured_logging_overhead_s": 0.034261202, "config_suppress_errors": false, "config_inline_inbuilt_nn_modules": true, "specialize_float": true}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0} +V1003 10:11:02.649000 2235078 torch/_dynamo/convert_frame.py:915] {"dynamo_start": {"stack": [{"line": 916, "name": "", "filename": 1}, {"line": 14, "name": "run_tests", "filename": 2}, {"line": 38, "name": "run_tests", "filename": 3}, {"line": 1273, "name": "run_tests", "filename": 4}, {"line": 102, "name": "__init__", "filename": 5}, {"line": 274, "name": "runTests", "filename": 5}, {"line": 217, "name": "run", "filename": 6}, {"line": 84, "name": "__call__", "filename": 7}, {"line": 122, "name": "run", "filename": 7}, {"line": 84, "name": "__call__", "filename": 7}, {"line": 122, "name": "run", "filename": 7}, {"line": 678, "name": "__call__", "filename": 8}, {"line": 3116, "name": "run", "filename": 4}, {"line": 3088, "name": "_run_custom", "filename": 4}, {"line": 623, "name": "run", "filename": 8}, {"line": 579, "name": "_callTestMethod", "filename": 8}, {"line": 2983, "name": "wrapper", "filename": 4}, {"line": 81, "name": "inner", "filename": 9}, {"line": 81, "name": "inner", "filename": 9}, {"line": 405, "name": "test_flex_attention_caching", "filename": 1}, {"line": 380, "name": "fn", "filename": 1}, {"line": 1062, "name": "flex_attention", "filename": 10}, {"line": 1049, "name": "_flex_attention_hop_wrapper", "filename": 10}]}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0} +V1003 10:11:02.649000 2235078 torch/_dynamo/utils.py:988] {"chromium_event": {}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "a2ef97fb78450a26e1e87b56945ab331"} + { + "name": "_compile.compile_inner", + "ts": 1727975462649375.8, + "args": null, + "ph": "B", + "cat": "dynamo_timed", + "tid": 0, + "pid": 0 + } +V1003 10:11:02.649000 2235078 torch/_dynamo/utils.py:988] {"chromium_event": {}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "3a33794e999255e5aaa28e163fcdc260"} + { + "name": "entire_frame_compile", + "ts": 1727975462649375.8, + "args": null, + "ph": "B", + "cat": "dynamo_timed", + "tid": 0, + "pid": 0 + } +V1003 10:11:02.652000 2235078 torch/_subclasses/meta_utils.py:204] {"describe_storage": {"id": 0, "describer_id": 240, "size": 524288}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0} +V1003 10:11:02.652000 2235078 torch/_subclasses/meta_utils.py:417] {"describe_tensor": {"id": 0, "ndim": 4, "dtype": "torch.float32", "device": "device(type='cuda', index=0)", "size": [1, 4, 512, 64], "is_leaf": true, "stride": [131072, 32768, 64, 1], "storage": 0, "view_func": "", "describer_id": 240}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0} +V1003 10:11:02.653000 2235078 torch/_subclasses/meta_utils.py:1640] {"describe_source": {"describer_id": 240, "id": 0, "source": "L['args'][0]"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0} +V1003 10:11:02.768000 2235078 torch/_subclasses/meta_utils.py:204] {"describe_storage": {"id": 1, "describer_id": 240, "size": 524288}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0} +V1003 10:11:02.769000 2235078 torch/_subclasses/meta_utils.py:417] {"describe_tensor": {"id": 4, "ndim": 4, "dtype": "torch.float32", "device": "device(type='cuda', index=0)", "size": [1, 4, 512, 64], "is_leaf": true, "stride": [131072, 32768, 64, 1], "storage": 1, "view_func": "", "describer_id": 240}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0} +V1003 10:11:02.769000 2235078 torch/_subclasses/meta_utils.py:1640] {"describe_source": {"describer_id": 240, "id": 4, "source": "L['args'][1]"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0} +V1003 10:11:02.770000 2235078 torch/_subclasses/meta_utils.py:204] {"describe_storage": {"id": 2, "describer_id": 240, "size": 524288}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0} +V1003 10:11:02.770000 2235078 torch/_subclasses/meta_utils.py:417] {"describe_tensor": {"id": 5, "ndim": 4, "dtype": "torch.float32", "device": "device(type='cuda', index=0)", "size": [1, 4, 512, 64], "is_leaf": true, "stride": [131072, 32768, 64, 1], "storage": 2, "view_func": "", "describer_id": 240}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0} +V1003 10:11:02.771000 2235078 torch/_subclasses/meta_utils.py:1640] {"describe_source": {"describer_id": 240, "id": 5, "source": "L['args'][2]"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0} +V1003 10:11:02.772000 2235078 torch/_subclasses/meta_utils.py:204] {"describe_storage": {"id": 3, "describer_id": 240, "size": 64}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0} +V1003 10:11:02.772000 2235078 torch/_subclasses/meta_utils.py:417] {"describe_tensor": {"id": 6, "ndim": 3, "dtype": "torch.int32", "device": "device(type='cuda', index=0)", "size": [1, 1, 16], "is_leaf": true, "stride": [16, 16, 1], "storage": 3, "view_func": "", "describer_id": 240}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0} +V1003 10:11:02.772000 2235078 torch/_subclasses/meta_utils.py:1640] {"describe_source": {"describer_id": 240, "id": 6, "source": "L['args'][4][0]"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0} +V1003 10:11:02.773000 2235078 torch/_subclasses/meta_utils.py:204] {"describe_storage": {"id": 4, "describer_id": 240, "size": 1024}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0} +V1003 10:11:02.774000 2235078 torch/_subclasses/meta_utils.py:417] {"describe_tensor": {"id": 7, "ndim": 4, "dtype": "torch.int32", "device": "device(type='cuda', index=0)", "size": [1, 1, 16, 16], "is_leaf": true, "stride": [256, 256, 16, 1], "storage": 4, "view_func": "", "describer_id": 240}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0} +V1003 10:11:02.774000 2235078 torch/_subclasses/meta_utils.py:1640] {"describe_source": {"describer_id": 240, "id": 7, "source": "L['args'][4][1]"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0} +V1003 10:11:02.775000 2235078 torch/_subclasses/meta_utils.py:204] {"describe_storage": {"id": 5, "describer_id": 240, "size": 64}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0} +V1003 10:11:02.775000 2235078 torch/_subclasses/meta_utils.py:417] {"describe_tensor": {"id": 8, "ndim": 3, "dtype": "torch.int32", "device": "device(type='cuda', index=0)", "size": [1, 1, 16], "is_leaf": true, "stride": [16, 16, 1], "storage": 5, "view_func": "", "describer_id": 240}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0} +V1003 10:11:02.775000 2235078 torch/_subclasses/meta_utils.py:1640] {"describe_source": {"describer_id": 240, "id": 8, "source": "L['args'][4][2]"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0} +V1003 10:11:02.776000 2235078 torch/_subclasses/meta_utils.py:204] {"describe_storage": {"id": 6, "describer_id": 240, "size": 1024}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0} +V1003 10:11:02.777000 2235078 torch/_subclasses/meta_utils.py:417] {"describe_tensor": {"id": 9, "ndim": 4, "dtype": "torch.int32", "device": "device(type='cuda', index=0)", "size": [1, 1, 16, 16], "is_leaf": true, "stride": [256, 256, 16, 1], "storage": 6, "view_func": "", "describer_id": 240}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0} +V1003 10:11:02.777000 2235078 torch/_subclasses/meta_utils.py:1640] {"describe_source": {"describer_id": 240, "id": 9, "source": "L['args'][4][3]"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0} +V1003 10:11:02.778000 2235078 torch/_subclasses/meta_utils.py:204] {"describe_storage": {"id": 7, "describer_id": 240, "size": 64}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0} +V1003 10:11:02.778000 2235078 torch/_subclasses/meta_utils.py:417] {"describe_tensor": {"id": 10, "ndim": 3, "dtype": "torch.int32", "device": "device(type='cuda', index=0)", "size": [1, 1, 16], "is_leaf": true, "stride": [16, 16, 1], "storage": 7, "view_func": "", "describer_id": 240}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0} +V1003 10:11:02.778000 2235078 torch/_subclasses/meta_utils.py:1640] {"describe_source": {"describer_id": 240, "id": 10, "source": "L['args'][4][4]"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0} +V1003 10:11:02.779000 2235078 torch/_subclasses/meta_utils.py:204] {"describe_storage": {"id": 8, "describer_id": 240, "size": 1024}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0} +V1003 10:11:02.780000 2235078 torch/_subclasses/meta_utils.py:417] {"describe_tensor": {"id": 11, "ndim": 4, "dtype": "torch.int32", "device": "device(type='cuda', index=0)", "size": [1, 1, 16, 16], "is_leaf": true, "stride": [256, 256, 16, 1], "storage": 8, "view_func": "", "describer_id": 240}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0} +V1003 10:11:02.780000 2235078 torch/_subclasses/meta_utils.py:1640] {"describe_source": {"describer_id": 240, "id": 11, "source": "L['args'][4][5]"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0} +V1003 10:11:02.781000 2235078 torch/_subclasses/meta_utils.py:204] {"describe_storage": {"id": 9, "describer_id": 240, "size": 64}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0} +V1003 10:11:02.781000 2235078 torch/_subclasses/meta_utils.py:417] {"describe_tensor": {"id": 12, "ndim": 3, "dtype": "torch.int32", "device": "device(type='cuda', index=0)", "size": [1, 1, 16], "is_leaf": true, "stride": [16, 16, 1], "storage": 9, "view_func": "", "describer_id": 240}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0} +V1003 10:11:02.781000 2235078 torch/_subclasses/meta_utils.py:1640] {"describe_source": {"describer_id": 240, "id": 12, "source": "L['args'][4][6]"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0} +V1003 10:11:02.782000 2235078 torch/_subclasses/meta_utils.py:204] {"describe_storage": {"id": 10, "describer_id": 240, "size": 1024}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0} +V1003 10:11:02.783000 2235078 torch/_subclasses/meta_utils.py:417] {"describe_tensor": {"id": 13, "ndim": 4, "dtype": "torch.int32", "device": "device(type='cuda', index=0)", "size": [1, 1, 16, 16], "is_leaf": true, "stride": [256, 256, 16, 1], "storage": 10, "view_func": "", "describer_id": 240}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0} +V1003 10:11:02.783000 2235078 torch/_subclasses/meta_utils.py:1640] {"describe_source": {"describer_id": 240, "id": 13, "source": "L['args'][4][7]"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0} +V1003 10:11:02.791000 2235078 torch/_dynamo/output_graph.py:1347] {"dynamo_output_graph": {"sizes": {"l_args_0_": [1, 4, 512, 64], "l_args_1_": [1, 4, 512, 64], "l_args_2_": [1, 4, 512, 64], "l_args_4_0_": [1, 1, 16], "l_args_4_1_": [1, 1, 16, 16], "l_args_4_2_": [1, 1, 16], "l_args_4_3_": [1, 1, 16, 16], "l_args_4_4_": [1, 1, 16], "l_args_4_5_": [1, 1, 16, 16], "l_args_4_6_": [1, 1, 16], "l_args_4_7_": [1, 1, 16, 16], "child_1": [], "child_2": [], "child_3": [], "child_4": [], "child": [], "child_5": [], "child_6": [], "child_7": [], "child_8": [], "getitem": [1, 4, 512, 64], "getitem_1": [1, 4, 512]}}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "36de5ad6eb1efc648a27dc62c107a2ca"} + class GraphModule(torch.nn.Module): + def forward(self, L_args_0_: "f32[1, 4, 512, 64][131072, 32768, 64, 1]cuda:0", L_args_1_: "f32[1, 4, 512, 64][131072, 32768, 64, 1]cuda:0", L_args_2_: "f32[1, 4, 512, 64][131072, 32768, 64, 1]cuda:0", L_args_4_0_: "i32[1, 1, 16][16, 16, 1]cuda:0", L_args_4_1_: "i32[1, 1, 16, 16][256, 256, 16, 1]cuda:0", L_args_4_2_: "i32[1, 1, 16][16, 16, 1]cuda:0", L_args_4_3_: "i32[1, 1, 16, 16][256, 256, 16, 1]cuda:0", L_args_4_4_: "i32[1, 1, 16][16, 16, 1]cuda:0", L_args_4_5_: "i32[1, 1, 16, 16][256, 256, 16, 1]cuda:0", L_args_4_6_: "i32[1, 1, 16][16, 16, 1]cuda:0", L_args_4_7_: "i32[1, 1, 16, 16][256, 256, 16, 1]cuda:0"): + l_args_0_ = L_args_0_ + l_args_1_ = L_args_1_ + l_args_2_ = L_args_2_ + l_args_4_0_ = L_args_4_0_ + l_args_4_1_ = L_args_4_1_ + l_args_4_2_ = L_args_4_2_ + l_args_4_3_ = L_args_4_3_ + l_args_4_4_ = L_args_4_4_ + l_args_4_5_ = L_args_4_5_ + l_args_4_6_ = L_args_4_6_ + l_args_4_7_ = L_args_4_7_ + + # File: /data/users/oulgen/pytorch/torch/nn/attention/flex_attention.py:1050 in _flex_attention_hop_wrapper, code: return flex_attention_hop(*args, **kwargs) + child_1: "i32[][]cuda:0" = l_args_0_.new_empty([], dtype = torch.int32); child_1 = None + child_2: "i32[][]cuda:0" = l_args_0_.new_empty([], dtype = torch.int32); child_2 = None + child_3: "i32[][]cuda:0" = l_args_0_.new_empty([], dtype = torch.int32); child_3 = None + child_4: "i32[][]cuda:0" = l_args_0_.new_empty([], dtype = torch.int32); child_4 = None + child: "f32[][]cuda:0" = l_args_0_.new_empty([], requires_grad = False); child = None + score_mod_0 = self.score_mod_0 + child_5: "i32[][]cuda:0" = l_args_0_.new_empty([], dtype = torch.int32); child_5 = None + child_6: "i32[][]cuda:0" = l_args_0_.new_empty([], dtype = torch.int32); child_6 = None + child_7: "i32[][]cuda:0" = l_args_0_.new_empty([], dtype = torch.int32); child_7 = None + child_8: "i32[][]cuda:0" = l_args_0_.new_empty([], dtype = torch.int32); child_8 = None + mask_fn_0 = self.mask_fn_0 + flex_attention = torch.ops.higher_order.flex_attention(l_args_0_, l_args_1_, l_args_2_, score_mod_0, (l_args_4_0_, l_args_4_1_, l_args_4_2_, l_args_4_3_, l_args_4_4_, l_args_4_5_, l_args_4_6_, l_args_4_7_, 128, 128, mask_fn_0), 0.125, {'ROWS_GUARANTEED_SAFE': False, 'PRESCALE_QK': False, 'OUTPUT_LOGSUMEXP': False}, (), ()); l_args_0_ = l_args_1_ = l_args_2_ = score_mod_0 = l_args_4_0_ = l_args_4_1_ = l_args_4_2_ = l_args_4_3_ = l_args_4_4_ = l_args_4_5_ = l_args_4_6_ = l_args_4_7_ = mask_fn_0 = None + getitem: "f32[1, 4, 512, 64][131072, 32768, 64, 1]cuda:0" = flex_attention[0] + getitem_1: "f32[1, 4, 512][2048, 512, 1]cuda:0" = flex_attention[1]; flex_attention = None + return (getitem, getitem_1) + + class score_mod_0(torch.nn.Module): + def forward(self, child: "f32[][]cuda:0", child_1: "i32[][]cuda:0", child_2: "i32[][]cuda:0", child_3: "i32[][]cuda:0", child_4: "i32[][]cuda:0"): + # File: /data/users/oulgen/pytorch/test/inductor/test_codecache.py:377 in score_mod, code: return score + (q - kv) + sub: "i32[][]cuda:0" = child_3 - child_4; child_3 = child_4 = None + add: "f32[][]cuda:0" = child + sub; child = sub = None + return add + + class mask_fn_0(torch.nn.Module): + def forward(self, child_5: "i32[][]cuda:0", child_6: "i32[][]cuda:0", child_7: "i32[][]cuda:0", child_8: "i32[][]cuda:0"): + # File: /data/users/oulgen/pytorch/test/inductor/test_codecache.py:373 in , code: lambda b, h, q, kv: q >= kv, None, None, 2048, 2048 + ge: "b8[][]cuda:0" = child_7 >= child_8; child_7 = child_8 = None + return ge + +V1003 10:11:02.792000 2235078 torch/_dynamo/utils.py:988] {"chromium_event": {}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "bbe4f56afd5b46c1bcfb8b89b3682d1c"} + { + "name": "OutputGraph.call_user_compiler", + "ts": 1727975462792636.8, + "args": null, + "ph": "B", + "cat": "dynamo_timed", + "tid": 0, + "pid": 0 + } +V1003 10:11:02.793000 2235078 torch/_dynamo/utils.py:988] {"chromium_event": {}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "42f674bdb6c8ba79dc796eb9e177d39e"} + { + "name": "backend_compile", + "ts": 1727975462792636.8, + "args": null, + "ph": "B", + "cat": "dynamo_timed", + "tid": 0, + "pid": 0 + } +V1003 10:11:02.793000 2235078 torch/_dynamo/utils.py:988] {"chromium_event": {}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "70d8c05c0dc42c6de392be3605cefc5d"} + { + "name": "backend_compile", + "ts": 1727975462793358.8, + "args": { + "cache_stats": { + "fxgraph_cache_hit": 0, + "fxgraph_cache_miss": 1, + "fxgraph_cache_bypass": 0 + } + }, + "ph": "E", + "cat": "dynamo_timed", + "tid": 0, + "pid": 0 + } +V1003 10:11:02.793000 2235078 torch/_dynamo/utils.py:988] {"chromium_event": {}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "51a3e0876977448806622fe8baf64561"} + { + "name": "OutputGraph.call_user_compiler", + "ts": 1727975462793614.8, + "args": { + "cache_stats": { + "fxgraph_cache_hit": 0, + "fxgraph_cache_miss": 1, + "fxgraph_cache_bypass": 0 + } + }, + "ph": "E", + "cat": "dynamo_timed", + "tid": 0, + "pid": 0 + } +V1003 10:11:02.818000 2235078 torch/_dynamo/guards.py:2311] {"dynamo_cpp_guards_str": {}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "2b6b7ba9e9bb8df50f6d571c50ac67a8"} + + TREE_GUARD_MANAGER: + +- RootGuardManager + | +- DEFAULT_DEVICE: utils_device.CURRENT_DEVICE == None # _dynamo/output_graph.py:471 in init_ambient_guards + | +- GLOBAL_STATE: ___check_global_state() + | +- TORCH_FUNCTION_MODE_STACK: ___check_torch_function_mode_stack() + | +- GuardManager: source=L['args'], accessed_by=DictGetItemGuardAccessor(args) + | | +- TYPE_MATCH: ___check_type_id(L['args'], 8815232) + | | +- LENGTH_CHECK: len(L['args']) == 7 + | | +- GuardManager: source=L['args'][0], accessed_by=TupleGetItemGuardAccessor(0) + | | | +- TENSOR_MATCH: check_tensor(L['args'][0], Tensor, DispatchKeySet(CUDA, BackendSelect, ADInplaceOrView, AutogradCUDA), torch.float32, device=0, requires_grad=False, size=[1, 4, 512, 64], stride=[131072, 32768, 64, 1]) + | | | +- NO_HASATTR: hasattr(L['args'][0], '_dynamo_dynamic_indices') == False + | | | +- NO_TENSOR_ALIASING: check_no_aliasing(L['args'][0], L['args'][1], L['args'][2], L['args'][4][0], L['args'][4][1], L['args'][4][2], L['args'][4][3], L['args'][4][4], L['args'][4][5], L['args'][4][6], L['args'][4][7]) + | | +- GuardManager: source=L['args'][1], accessed_by=TupleGetItemGuardAccessor(1) + | | | +- TENSOR_MATCH: check_tensor(L['args'][1], Tensor, DispatchKeySet(CUDA, BackendSelect, ADInplaceOrView, AutogradCUDA), torch.float32, device=0, requires_grad=False, size=[1, 4, 512, 64], stride=[131072, 32768, 64, 1]) + | | | +- NO_HASATTR: hasattr(L['args'][1], '_dynamo_dynamic_indices') == False + | | | +- NO_TENSOR_ALIASING + | | +- GuardManager: source=L['args'][2], accessed_by=TupleGetItemGuardAccessor(2) + | | | +- TENSOR_MATCH: check_tensor(L['args'][2], Tensor, DispatchKeySet(CUDA, BackendSelect, ADInplaceOrView, AutogradCUDA), torch.float32, device=0, requires_grad=False, size=[1, 4, 512, 64], stride=[131072, 32768, 64, 1]) + | | | +- NO_HASATTR: hasattr(L['args'][2], '_dynamo_dynamic_indices') == False + | | | +- NO_TENSOR_ALIASING + | | +- GuardManager: source=L['args'][3], accessed_by=TupleGetItemGuardAccessor(3) + | | | +- GuardManager: source=L['args'][3].__code__, accessed_by=GetAttrGuardAccessor(__code__) + | | | | +- ID_MATCH: ___check_obj_id(L['args'][3].__code__, 140413271879296) + | | +- GuardManager: source=L['args'][4], accessed_by=TupleGetItemGuardAccessor(4) + | | | +- TYPE_MATCH: ___check_type_id(L['args'][4], 8815232) + | | | +- LENGTH_CHECK: len(L['args'][4]) == 11 + | | | +- GuardManager: source=L['args'][4][0], accessed_by=TupleGetItemGuardAccessor(0) + | | | | +- TENSOR_MATCH: check_tensor(L['args'][4][0], Tensor, DispatchKeySet(CUDA, BackendSelect, ADInplaceOrView, AutogradCUDA), torch.int32, device=0, requires_grad=False, size=[1, 1, 16], stride=[16, 16, 1]) + | | | | +- NO_HASATTR: hasattr(L['args'][4][0], '_dynamo_dynamic_indices') == False + | | | | +- NO_TENSOR_ALIASING + | | | +- GuardManager: source=L['args'][4][1], accessed_by=TupleGetItemGuardAccessor(1) + | | | | +- TENSOR_MATCH: check_tensor(L['args'][4][1], Tensor, DispatchKeySet(CUDA, BackendSelect, ADInplaceOrView, AutogradCUDA), torch.int32, device=0, requires_grad=False, size=[1, 1, 16, 16], stride=[256, 256, 16, 1]) + | | | | +- NO_HASATTR: hasattr(L['args'][4][1], '_dynamo_dynamic_indices') == False + | | | | +- NO_TENSOR_ALIASING + | | | +- GuardManager: source=L['args'][4][2], accessed_by=TupleGetItemGuardAccessor(2) + | | | | +- TENSOR_MATCH: check_tensor(L['args'][4][2], Tensor, DispatchKeySet(CUDA, BackendSelect, ADInplaceOrView, AutogradCUDA), torch.int32, device=0, requires_grad=False, size=[1, 1, 16], stride=[16, 16, 1]) + | | | | +- NO_HASATTR: hasattr(L['args'][4][2], '_dynamo_dynamic_indices') == False + | | | | +- NO_TENSOR_ALIASING + | | | +- GuardManager: source=L['args'][4][3], accessed_by=TupleGetItemGuardAccessor(3) + | | | | +- TENSOR_MATCH: check_tensor(L['args'][4][3], Tensor, DispatchKeySet(CUDA, BackendSelect, ADInplaceOrView, AutogradCUDA), torch.int32, device=0, requires_grad=False, size=[1, 1, 16, 16], stride=[256, 256, 16, 1]) + | | | | +- NO_HASATTR: hasattr(L['args'][4][3], '_dynamo_dynamic_indices') == False + | | | | +- NO_TENSOR_ALIASING + | | | +- GuardManager: source=L['args'][4][4], accessed_by=TupleGetItemGuardAccessor(4) + | | | | +- TENSOR_MATCH: check_tensor(L['args'][4][4], Tensor, DispatchKeySet(CUDA, BackendSelect, ADInplaceOrView, AutogradCUDA), torch.int32, device=0, requires_grad=False, size=[1, 1, 16], stride=[16, 16, 1]) + | | | | +- NO_HASATTR: hasattr(L['args'][4][4], '_dynamo_dynamic_indices') == False + | | | | +- NO_TENSOR_ALIASING + | | | +- GuardManager: source=L['args'][4][5], accessed_by=TupleGetItemGuardAccessor(5) + | | | | +- TENSOR_MATCH: check_tensor(L['args'][4][5], Tensor, DispatchKeySet(CUDA, BackendSelect, ADInplaceOrView, AutogradCUDA), torch.int32, device=0, requires_grad=False, size=[1, 1, 16, 16], stride=[256, 256, 16, 1]) + | | | | +- NO_HASATTR: hasattr(L['args'][4][5], '_dynamo_dynamic_indices') == False + | | | | +- NO_TENSOR_ALIASING + | | | +- GuardManager: source=L['args'][4][6], accessed_by=TupleGetItemGuardAccessor(6) + | | | | +- TENSOR_MATCH: check_tensor(L['args'][4][6], Tensor, DispatchKeySet(CUDA, BackendSelect, ADInplaceOrView, AutogradCUDA), torch.int32, device=0, requires_grad=False, size=[1, 1, 16], stride=[16, 16, 1]) + | | | | +- NO_HASATTR: hasattr(L['args'][4][6], '_dynamo_dynamic_indices') == False + | | | | +- NO_TENSOR_ALIASING + | | | +- GuardManager: source=L['args'][4][7], accessed_by=TupleGetItemGuardAccessor(7) + | | | | +- TENSOR_MATCH: check_tensor(L['args'][4][7], Tensor, DispatchKeySet(CUDA, BackendSelect, ADInplaceOrView, AutogradCUDA), torch.int32, device=0, requires_grad=False, size=[1, 1, 16, 16], stride=[256, 256, 16, 1]) + | | | | +- NO_HASATTR: hasattr(L['args'][4][7], '_dynamo_dynamic_indices') == False + | | | | +- NO_TENSOR_ALIASING + | | | +- GuardManager: source=L['args'][4][8], accessed_by=TupleGetItemGuardAccessor(8) + | | | | +- EQUALS_MATCH: L['args'][4][8] == 128 + | | | +- GuardManager: source=L['args'][4][9], accessed_by=TupleGetItemGuardAccessor(9) + | | | | +- EQUALS_MATCH: L['args'][4][9] == 128 + | | | +- GuardManager: source=L['args'][4][10], accessed_by=TupleGetItemGuardAccessor(10) + | | | | +- GuardManager: source=L['args'][4][10].__code__, accessed_by=GetAttrGuardAccessor(__code__) + | | | | | +- ID_MATCH: ___check_obj_id(L['args'][4][10].__code__, 140413271880128) + | | +- GuardManager: source=L['args'][5], accessed_by=TupleGetItemGuardAccessor(5) + | | | +- EQUALS_MATCH: L['args'][5] == 0.125 + | | +- GuardManager: source=L['args'][6], accessed_by=TupleGetItemGuardAccessor(6) + | | | +- DICT_LENGTH: len(L['args'][6]) == 3 + | | | +- GuardManager: source=L['args'][6]['ROWS_GUARANTEED_SAFE'], accessed_by=DictGetItemGuardAccessor(ROWS_GUARANTEED_SAFE) + | | | | +- ID_MATCH: ___check_obj_id(L['args'][6]['ROWS_GUARANTEED_SAFE'], 8910592) + | | | +- GuardManager: source=L['args'][6]['PRESCALE_QK'], accessed_by=DictGetItemGuardAccessor(PRESCALE_QK) + | | | | +- ID_MATCH: ___check_obj_id(L['args'][6]['PRESCALE_QK'], 8910592) + | | | +- GuardManager: source=L['args'][6]['OUTPUT_LOGSUMEXP'], accessed_by=DictGetItemGuardAccessor(OUTPUT_LOGSUMEXP) + | | | | +- ID_MATCH: ___check_obj_id(L['args'][6]['OUTPUT_LOGSUMEXP'], 8910592) + | +- GuardManager: source=L['kwargs'], accessed_by=DictGetItemGuardAccessor(kwargs) + | | +- DICT_LENGTH: not L['kwargs'] + | +- GuardManager: source=G, accessed_by=GlobalsGuardAccessor + | | +- GuardManager: source=G['flex_attention_hop'], accessed_by=DictGetItemGuardAccessor(flex_attention_hop) + | | | +- TYPE_MATCH: ___check_type_id(G['flex_attention_hop'], 96992544) + | | | +- GuardManager: source=G['flex_attention_hop'].__name__, accessed_by=GetAttrGuardAccessor(__name__) + | | | | +- EQUALS_MATCH: G['flex_attention_hop'].__name__ == 'flex_attention' + | | +- GuardManager: source=G['__builtins_dict___4'], accessed_by=DictGetItemGuardAccessor(__builtins_dict___4) + | | | +- GuardManager: source=G['__builtins_dict___4']['len'], accessed_by=DictGetItemGuardAccessor(len) + | | | | +- ID_MATCH: ___check_obj_id(G['__builtins_dict___4']['len'], 140413275558816) + | | | +- GuardManager: source=G['__builtins_dict___4']['sum'], accessed_by=DictGetItemGuardAccessor(sum) + | | | | +- ID_MATCH: ___check_obj_id(G['__builtins_dict___4']['sum'], 140413275559936) + | | | +- GuardManager: source=G['__builtins_dict___4']['list'], accessed_by=DictGetItemGuardAccessor(list) + | | | | +- ID_MATCH: ___check_obj_id(G['__builtins_dict___4']['list'], 8844320) + | | | +- GuardManager: source=G['__builtins_dict___4']['type'], accessed_by=DictGetItemGuardAccessor(type) + | | | | +- ID_MATCH: ___check_obj_id(G['__builtins_dict___4']['type'], 8813248) + | | | +- GuardManager: source=G['__builtins_dict___4']['tuple'], accessed_by=DictGetItemGuardAccessor(tuple) + | | | | +- ID_MATCH: ___check_obj_id(G['__builtins_dict___4']['tuple'], 8815232) + | | | +- GuardManager: source=G['__builtins_dict___4']['object'], accessed_by=DictGetItemGuardAccessor(object) + | | | | +- ID_MATCH: ___check_obj_id(G['__builtins_dict___4']['object'], 8813984) + | | | +- GuardManager: source=G['__builtins_dict___4']['isinstance'], accessed_by=DictGetItemGuardAccessor(isinstance) + | | | | +- ID_MATCH: ___check_obj_id(G['__builtins_dict___4']['isinstance'], 140413275558496) + | | +- GuardManager: source=G['__import_torch_dot_utils_dot__pytree'], accessed_by=DictGetItemGuardAccessor(__import_torch_dot_utils_dot__pytree) + | | | +- ID_MATCH: ___check_obj_id(G['__import_torch_dot_utils_dot__pytree'], 140411217627952) + | | | +- GuardManager: source=G['__import_torch_dot_utils_dot__pytree'].TreeSpec, accessed_by=GetAttrGuardAccessor(TreeSpec) + | | | | +- ID_MATCH: ___check_obj_id(G['__import_torch_dot_utils_dot__pytree'].TreeSpec, 84866496) + | | | +- GuardManager: source=G['__import_torch_dot_utils_dot__pytree']._is_leaf, accessed_by=GetAttrGuardAccessor(_is_leaf) + | | | | +- GuardManager: source=G['__import_torch_dot_utils_dot__pytree']._is_leaf.__code__, accessed_by=GetAttrGuardAccessor(__code__) + | | | | | +- ID_MATCH: ___check_obj_id(G['__import_torch_dot_utils_dot__pytree']._is_leaf.__code__, 140411217262720) + | | | +- GuardManager: source=G['__import_torch_dot_utils_dot__pytree']._LEAF_SPEC, accessed_by=GetAttrGuardAccessor(_LEAF_SPEC) + | | | | +- TYPE_MATCH: ___check_type_id(G['__import_torch_dot_utils_dot__pytree']._LEAF_SPEC, 85171104) + | | | | +- GuardManager: source=G['__import_torch_dot_utils_dot__pytree']._LEAF_SPEC.type, accessed_by=GetAttrGuardAccessor(type) + | | | | | +- ID_MATCH: ___check_obj_id(G['__import_torch_dot_utils_dot__pytree']._LEAF_SPEC.type, 8825760) + | | | | +- GuardManager: source=G['__import_torch_dot_utils_dot__pytree']._LEAF_SPEC.context, accessed_by=GetAttrGuardAccessor(context) + | | | | | +- ID_MATCH: ___check_obj_id(G['__import_torch_dot_utils_dot__pytree']._LEAF_SPEC.context, 8825760) + | | | | +- GuardManager: source=G['__import_torch_dot_utils_dot__pytree']._LEAF_SPEC.num_nodes, accessed_by=GetAttrGuardAccessor(num_nodes) + | | | | | +- EQUALS_MATCH: G['__import_torch_dot_utils_dot__pytree']._LEAF_SPEC.num_nodes == 1 + | | | | +- GuardManager: source=G['__import_torch_dot_utils_dot__pytree']._LEAF_SPEC.num_leaves, accessed_by=GetAttrGuardAccessor(num_leaves) + | | | | | +- EQUALS_MATCH: G['__import_torch_dot_utils_dot__pytree']._LEAF_SPEC.num_leaves == 1 + | | | | +- GuardManager: source=G['__import_torch_dot_utils_dot__pytree']._LEAF_SPEC.num_children, accessed_by=GetAttrGuardAccessor(num_children) + | | | | | +- EQUALS_MATCH: G['__import_torch_dot_utils_dot__pytree']._LEAF_SPEC.num_children == 0 + | | | | +- GuardManager: source=G['__import_torch_dot_utils_dot__pytree']._LEAF_SPEC.children_specs, accessed_by=GetAttrGuardAccessor(children_specs) + | | | | | +- TYPE_MATCH: ___check_type_id(G['__import_torch_dot_utils_dot__pytree']._LEAF_SPEC.children_specs, 8844320) + | | | | | +- LENGTH_CHECK: not G['__import_torch_dot_utils_dot__pytree']._LEAF_SPEC.children_specs + | | | +- GuardManager: source=G['__import_torch_dot_utils_dot__pytree']._get_node_type, accessed_by=GetAttrGuardAccessor(_get_node_type) + | | | | +- GuardManager: source=G['__import_torch_dot_utils_dot__pytree']._get_node_type.__code__, accessed_by=GetAttrGuardAccessor(__code__) + | | | | | +- ID_MATCH: ___check_obj_id(G['__import_torch_dot_utils_dot__pytree']._get_node_type.__code__, 140411217262448) + | | | +- DictGuardManager: source=G['__import_torch_dot_utils_dot__pytree'].SUPPORTED_NODES, accessed_by=GetAttrGuardAccessor(SUPPORTED_NODES) + | | | | +- DICT_VERSION: ___dict_version(G['__import_torch_dot_utils_dot__pytree'].SUPPORTED_NODES) == 519596 + | | | | +- KeyValueManager pair at index=1 + | | | | | +- ValueManager: GuardManager: source=G['__import_torch_dot_utils_dot__pytree'].SUPPORTED_NODES[list(G['__import_torch_dot_utils_dot__pytree'].SUPPORTED_NODES.keys())[1]] + | | | | | | +- GuardManager: source=G['__import_torch_dot_utils_dot__pytree'].SUPPORTED_NODES[list(G['__import_torch_dot_utils_dot__pytree'].SUPPORTED_NODES.keys())[1]].flatten_fn, accessed_by=GetAttrGuardAccessor(flatten_fn) + | | | | | | | +- GuardManager: source=G['__import_torch_dot_utils_dot__pytree'].SUPPORTED_NODES[list(G['__import_torch_dot_utils_dot__pytree'].SUPPORTED_NODES.keys())[1]].flatten_fn.__code__, accessed_by=GetAttrGuardAccessor(__code__) + | | | | | | | | +- ID_MATCH: ___check_obj_id(G['__import_torch_dot_utils_dot__pytree'].SUPPORTED_NODES[list(G['__import_torch_dot_utils_dot__pytree'].SUPPORTED_NODES.keys())[1]].flatten_fn.__code__, 140411196281984) + | | | | | | +- GuardManager: source=G['__import_torch_dot_utils_dot__pytree'].SUPPORTED_NODES[list(G['__import_torch_dot_utils_dot__pytree'].SUPPORTED_NODES.keys())[1]].unflatten_fn, accessed_by=GetAttrGuardAccessor(unflatten_fn) + | | | | | | | +- GuardManager: source=G['__import_torch_dot_utils_dot__pytree'].SUPPORTED_NODES[list(G['__import_torch_dot_utils_dot__pytree'].SUPPORTED_NODES.keys())[1]].unflatten_fn.__code__, accessed_by=GetAttrGuardAccessor(__code__) + | | | | | | | | +- ID_MATCH: ___check_obj_id(G['__import_torch_dot_utils_dot__pytree'].SUPPORTED_NODES[list(G['__import_torch_dot_utils_dot__pytree'].SUPPORTED_NODES.keys())[1]].unflatten_fn.__code__, 140411217182288) + | | | +- GuardManager: source=G['__import_torch_dot_utils_dot__pytree']._tree_flatten_helper, accessed_by=GetAttrGuardAccessor(_tree_flatten_helper) + | | | | +- GuardManager: source=G['__import_torch_dot_utils_dot__pytree']._tree_flatten_helper.__code__, accessed_by=GetAttrGuardAccessor(__code__) + | | | | | +- ID_MATCH: ___check_obj_id(G['__import_torch_dot_utils_dot__pytree']._tree_flatten_helper.__code__, 140411217413040) + | | | +- GuardManager: source=G['__import_torch_dot_utils_dot__pytree']._is_namedtuple_instance, accessed_by=GetAttrGuardAccessor(_is_namedtuple_instance) + | | | | +- GuardManager: source=G['__import_torch_dot_utils_dot__pytree']._is_namedtuple_instance.__code__, accessed_by=GetAttrGuardAccessor(__code__) + | | | | | +- ID_MATCH: ___check_obj_id(G['__import_torch_dot_utils_dot__pytree']._is_namedtuple_instance.__code__, 140411217412592) + +V1003 10:11:02.818000 2235078 torch/_dynamo/utils.py:988] {"chromium_event": {}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "0a198eacfa70f28e771164e9f21c0377"} + { + "name": "entire_frame_compile", + "ts": 1727975462818771.2, + "args": { + "cache_stats": { + "fxgraph_cache_hit": 0, + "fxgraph_cache_miss": 1, + "fxgraph_cache_bypass": 0 + } + }, + "ph": "E", + "cat": "dynamo_timed", + "tid": 0, + "pid": 0 + } +V1003 10:11:02.819000 2235078 torch/_dynamo/utils.py:988] {"chromium_event": {}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "fe22d36e59a389d50249ec2f0d81445a"} + { + "name": "_compile.compile_inner", + "ts": 1727975462819091.0, + "args": { + "cache_stats": { + "fxgraph_cache_hit": 0, + "fxgraph_cache_miss": 1, + "fxgraph_cache_bypass": 0 + } + }, + "ph": "E", + "cat": "dynamo_timed", + "tid": 0, + "pid": 0 + } +V1003 10:11:02.819000 2235078 torch/_dynamo/utils.py:840] {"compilation_metrics": {"compile_id": "0/0", "frame_key": "1", "co_name": "_flex_attention_hop_wrapper", "co_filename": "/data/users/oulgen/pytorch/torch/nn/attention/flex_attention.py", "co_firstlineno": 1049, "cache_size": 0, "accumulated_cache_size": 0, "guard_count": 55, "shape_env_guard_count": 0, "graph_op_count": 12, "graph_node_count": 26, "graph_input_count": 11, "start_time": 1727975462.6493652, "entire_frame_compile_time_s": 0.16928768157958984, "backend_compile_time_s": 0.0006258487701416016, "inductor_compile_time_s": null, "code_gen_time_s": null, "fail_type": null, "fail_reason": null, "fail_user_frame_filename": null, "fail_user_frame_lineno": null, "non_compliant_ops": [], "compliant_custom_ops": [], "restart_reasons": [], "dynamo_time_before_restart_s": 0.0, "has_guarded_code": true, "possibly_missed_reinplacing_opportunities": 0, "remote_cache_time_saved_s": 0, "structured_logging_overhead_s": 0.032939939, "config_suppress_errors": false, "config_inline_inbuilt_nn_modules": true, "specialize_float": true}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0} +V1003 10:11:02.823000 2235078 torch/_dynamo/convert_frame.py:915] {"dynamo_start": {"stack": [{"line": 916, "name": "", "filename": 1}, {"line": 14, "name": "run_tests", "filename": 2}, {"line": 38, "name": "run_tests", "filename": 3}, {"line": 1273, "name": "run_tests", "filename": 4}, {"line": 102, "name": "__init__", "filename": 5}, {"line": 274, "name": "runTests", "filename": 5}, {"line": 217, "name": "run", "filename": 6}, {"line": 84, "name": "__call__", "filename": 7}, {"line": 122, "name": "run", "filename": 7}, {"line": 84, "name": "__call__", "filename": 7}, {"line": 122, "name": "run", "filename": 7}, {"line": 678, "name": "__call__", "filename": 8}, {"line": 3116, "name": "run", "filename": 4}, {"line": 3088, "name": "_run_custom", "filename": 4}, {"line": 623, "name": "run", "filename": 8}, {"line": 579, "name": "_callTestMethod", "filename": 8}, {"line": 2983, "name": "wrapper", "filename": 4}, {"line": 81, "name": "inner", "filename": 9}, {"line": 81, "name": "inner", "filename": 9}, {"line": 405, "name": "test_flex_attention_caching", "filename": 1}, {"line": 379, "name": "fn", "filename": 1}]}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0} +V1003 10:11:02.824000 2235078 torch/_dynamo/utils.py:988] {"chromium_event": {}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0, "has_payload": "5c328b3ed733b4526b0979bdb2929653"} + { + "name": "_compile.compile_inner", + "ts": 1727975462824011.2, + "args": null, + "ph": "B", + "cat": "dynamo_timed", + "tid": 0, + "pid": 0 + } +V1003 10:11:02.824000 2235078 torch/_dynamo/utils.py:988] {"chromium_event": {}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0, "has_payload": "7333e4a8a9e70773a0188ced97e391ef"} + { + "name": "entire_frame_compile", + "ts": 1727975462824011.2, + "args": null, + "ph": "B", + "cat": "dynamo_timed", + "tid": 0, + "pid": 0 + } +V1003 10:11:02.826000 2235078 torch/_subclasses/meta_utils.py:204] {"describe_storage": {"id": 0, "describer_id": 256, "size": 524288}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0} +V1003 10:11:02.827000 2235078 torch/_subclasses/meta_utils.py:417] {"describe_tensor": {"id": 0, "ndim": 4, "dtype": "torch.float32", "device": "device(type='cuda', index=0)", "size": [1, 4, 512, 64], "is_leaf": true, "stride": [131072, 32768, 64, 1], "storage": 0, "view_func": "", "describer_id": 256}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0} +V1003 10:11:02.827000 2235078 torch/_subclasses/meta_utils.py:1640] {"describe_source": {"describer_id": 256, "id": 0, "source": "L['q']"}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0} +V1003 10:11:02.836000 2235078 torch/_subclasses/meta_utils.py:204] {"describe_storage": {"id": 1, "describer_id": 256, "size": 524288}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0} +V1003 10:11:02.836000 2235078 torch/_subclasses/meta_utils.py:417] {"describe_tensor": {"id": 1, "ndim": 4, "dtype": "torch.float32", "device": "device(type='cuda', index=0)", "size": [1, 4, 512, 64], "is_leaf": true, "stride": [131072, 32768, 64, 1], "storage": 1, "view_func": "", "describer_id": 256}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0} +V1003 10:11:02.836000 2235078 torch/_subclasses/meta_utils.py:1640] {"describe_source": {"describer_id": 256, "id": 1, "source": "L['k']"}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0} +V1003 10:11:02.838000 2235078 torch/_subclasses/meta_utils.py:204] {"describe_storage": {"id": 2, "describer_id": 256, "size": 524288}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0} +V1003 10:11:02.838000 2235078 torch/_subclasses/meta_utils.py:417] {"describe_tensor": {"id": 2, "ndim": 4, "dtype": "torch.float32", "device": "device(type='cuda', index=0)", "size": [1, 4, 512, 64], "is_leaf": true, "stride": [131072, 32768, 64, 1], "storage": 2, "view_func": "", "describer_id": 256}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0} +V1003 10:11:02.838000 2235078 torch/_subclasses/meta_utils.py:1640] {"describe_source": {"describer_id": 256, "id": 2, "source": "L['v']"}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0} +V1003 10:11:02.847000 2235078 torch/_subclasses/meta_utils.py:204] {"describe_storage": {"id": 3, "describer_id": 256, "size": 64}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0} +V1003 10:11:02.847000 2235078 torch/_subclasses/meta_utils.py:417] {"describe_tensor": {"id": 3, "ndim": 3, "dtype": "torch.int32", "device": "device(type='cuda', index=0)", "size": [1, 1, 16], "is_leaf": true, "stride": [16, 16, 1], "storage": 3, "view_func": "", "describer_id": 256}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0} +V1003 10:11:02.847000 2235078 torch/_subclasses/meta_utils.py:1640] {"describe_source": {"describer_id": 256, "id": 3, "source": "L['block_mask'].kv_num_blocks"}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0} +V1003 10:11:02.985000 2235078 torch/_subclasses/meta_utils.py:204] {"describe_storage": {"id": 4, "describer_id": 256, "size": 1024}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0} +V1003 10:11:02.985000 2235078 torch/_subclasses/meta_utils.py:417] {"describe_tensor": {"id": 7, "ndim": 4, "dtype": "torch.int32", "device": "device(type='cuda', index=0)", "size": [1, 1, 16, 16], "is_leaf": true, "stride": [256, 256, 16, 1], "storage": 4, "view_func": "", "describer_id": 256}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0} +V1003 10:11:02.985000 2235078 torch/_subclasses/meta_utils.py:1640] {"describe_source": {"describer_id": 256, "id": 7, "source": "L['block_mask'].kv_indices"}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0} +V1003 10:11:02.987000 2235078 torch/_subclasses/meta_utils.py:204] {"describe_storage": {"id": 5, "describer_id": 256, "size": 64}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0} +V1003 10:11:02.987000 2235078 torch/_subclasses/meta_utils.py:417] {"describe_tensor": {"id": 8, "ndim": 3, "dtype": "torch.int32", "device": "device(type='cuda', index=0)", "size": [1, 1, 16], "is_leaf": true, "stride": [16, 16, 1], "storage": 5, "view_func": "", "describer_id": 256}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0} +V1003 10:11:02.987000 2235078 torch/_subclasses/meta_utils.py:1640] {"describe_source": {"describer_id": 256, "id": 8, "source": "L['block_mask'].full_kv_num_blocks"}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0} +V1003 10:11:02.989000 2235078 torch/_subclasses/meta_utils.py:204] {"describe_storage": {"id": 6, "describer_id": 256, "size": 1024}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0} +V1003 10:11:02.989000 2235078 torch/_subclasses/meta_utils.py:417] {"describe_tensor": {"id": 9, "ndim": 4, "dtype": "torch.int32", "device": "device(type='cuda', index=0)", "size": [1, 1, 16, 16], "is_leaf": true, "stride": [256, 256, 16, 1], "storage": 6, "view_func": "", "describer_id": 256}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0} +V1003 10:11:02.989000 2235078 torch/_subclasses/meta_utils.py:1640] {"describe_source": {"describer_id": 256, "id": 9, "source": "L['block_mask'].full_kv_indices"}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0} +V1003 10:11:02.991000 2235078 torch/_subclasses/meta_utils.py:204] {"describe_storage": {"id": 7, "describer_id": 256, "size": 64}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0} +V1003 10:11:02.991000 2235078 torch/_subclasses/meta_utils.py:417] {"describe_tensor": {"id": 10, "ndim": 3, "dtype": "torch.int32", "device": "device(type='cuda', index=0)", "size": [1, 1, 16], "is_leaf": true, "stride": [16, 16, 1], "storage": 7, "view_func": "", "describer_id": 256}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0} +V1003 10:11:02.991000 2235078 torch/_subclasses/meta_utils.py:1640] {"describe_source": {"describer_id": 256, "id": 10, "source": "L['block_mask'].q_num_blocks"}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0} +V1003 10:11:02.992000 2235078 torch/_subclasses/meta_utils.py:204] {"describe_storage": {"id": 8, "describer_id": 256, "size": 1024}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0} +V1003 10:11:02.993000 2235078 torch/_subclasses/meta_utils.py:417] {"describe_tensor": {"id": 11, "ndim": 4, "dtype": "torch.int32", "device": "device(type='cuda', index=0)", "size": [1, 1, 16, 16], "is_leaf": true, "stride": [256, 256, 16, 1], "storage": 8, "view_func": "", "describer_id": 256}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0} +V1003 10:11:02.993000 2235078 torch/_subclasses/meta_utils.py:1640] {"describe_source": {"describer_id": 256, "id": 11, "source": "L['block_mask'].q_indices"}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0} +V1003 10:11:02.994000 2235078 torch/_subclasses/meta_utils.py:204] {"describe_storage": {"id": 9, "describer_id": 256, "size": 64}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0} +V1003 10:11:02.995000 2235078 torch/_subclasses/meta_utils.py:417] {"describe_tensor": {"id": 12, "ndim": 3, "dtype": "torch.int32", "device": "device(type='cuda', index=0)", "size": [1, 1, 16], "is_leaf": true, "stride": [16, 16, 1], "storage": 9, "view_func": "", "describer_id": 256}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0} +V1003 10:11:02.995000 2235078 torch/_subclasses/meta_utils.py:1640] {"describe_source": {"describer_id": 256, "id": 12, "source": "L['block_mask'].full_q_num_blocks"}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0} +V1003 10:11:02.996000 2235078 torch/_subclasses/meta_utils.py:204] {"describe_storage": {"id": 10, "describer_id": 256, "size": 1024}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0} +V1003 10:11:02.997000 2235078 torch/_subclasses/meta_utils.py:417] {"describe_tensor": {"id": 13, "ndim": 4, "dtype": "torch.int32", "device": "device(type='cuda', index=0)", "size": [1, 1, 16, 16], "is_leaf": true, "stride": [256, 256, 16, 1], "storage": 10, "view_func": "", "describer_id": 256}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0} +V1003 10:11:02.997000 2235078 torch/_subclasses/meta_utils.py:1640] {"describe_source": {"describer_id": 256, "id": 13, "source": "L['block_mask'].full_q_indices"}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0} +V1003 10:11:03.005000 2235078 torch/_dynamo/output_graph.py:1347] {"dynamo_output_graph": {"sizes": {"l_q_": [1, 4, 512, 64], "l_k_": [1, 4, 512, 64], "l_v_": [1, 4, 512, 64], "l_block_mask_kv_num_blocks": [1, 1, 16], "l_block_mask_kv_indices": [1, 1, 16, 16], "l_block_mask_full_kv_num_blocks": [1, 1, 16], "l_block_mask_full_kv_indices": [1, 1, 16, 16], "l_block_mask_q_num_blocks": [1, 1, 16], "l_block_mask_q_indices": [1, 1, 16, 16], "l_block_mask_full_q_num_blocks": [1, 1, 16], "l_block_mask_full_q_indices": [1, 1, 16, 16], "child_1": [], "child_2": [], "child_3": [], "child_4": [], "child": [], "child_5": [], "child_6": [], "child_7": [], "child_8": [], "out": [1, 4, 512, 64]}}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0, "has_payload": "0365bd76b3474fb6d96e4c3c42585fb7"} + class GraphModule(torch.nn.Module): + def forward(self, L_q_: "f32[1, 4, 512, 64][131072, 32768, 64, 1]cuda:0", L_k_: "f32[1, 4, 512, 64][131072, 32768, 64, 1]cuda:0", L_v_: "f32[1, 4, 512, 64][131072, 32768, 64, 1]cuda:0", L_block_mask_kv_num_blocks: "i32[1, 1, 16][16, 16, 1]cuda:0", L_block_mask_kv_indices: "i32[1, 1, 16, 16][256, 256, 16, 1]cuda:0", L_block_mask_full_kv_num_blocks: "i32[1, 1, 16][16, 16, 1]cuda:0", L_block_mask_full_kv_indices: "i32[1, 1, 16, 16][256, 256, 16, 1]cuda:0", L_block_mask_q_num_blocks: "i32[1, 1, 16][16, 16, 1]cuda:0", L_block_mask_q_indices: "i32[1, 1, 16, 16][256, 256, 16, 1]cuda:0", L_block_mask_full_q_num_blocks: "i32[1, 1, 16][16, 16, 1]cuda:0", L_block_mask_full_q_indices: "i32[1, 1, 16, 16][256, 256, 16, 1]cuda:0"): + l_q_ = L_q_ + l_k_ = L_k_ + l_v_ = L_v_ + l_block_mask_kv_num_blocks = L_block_mask_kv_num_blocks + l_block_mask_kv_indices = L_block_mask_kv_indices + l_block_mask_full_kv_num_blocks = L_block_mask_full_kv_num_blocks + l_block_mask_full_kv_indices = L_block_mask_full_kv_indices + l_block_mask_q_num_blocks = L_block_mask_q_num_blocks + l_block_mask_q_indices = L_block_mask_q_indices + l_block_mask_full_q_num_blocks = L_block_mask_full_q_num_blocks + l_block_mask_full_q_indices = L_block_mask_full_q_indices + + # File: /data/users/oulgen/pytorch/torch/nn/attention/flex_attention.py:1032 in flex_attention, code: out, lse = flex_attention_hop( + child_1: "i32[][]cuda:0" = l_q_.new_empty([], dtype = torch.int32); child_1 = None + child_2: "i32[][]cuda:0" = l_q_.new_empty([], dtype = torch.int32); child_2 = None + child_3: "i32[][]cuda:0" = l_q_.new_empty([], dtype = torch.int32); child_3 = None + child_4: "i32[][]cuda:0" = l_q_.new_empty([], dtype = torch.int32); child_4 = None + child: "f32[][]cuda:0" = l_q_.new_empty([], requires_grad = False); child = None + score_mod_0 = self.score_mod_0 + child_5: "i32[][]cuda:0" = l_q_.new_empty([], dtype = torch.int32); child_5 = None + child_6: "i32[][]cuda:0" = l_q_.new_empty([], dtype = torch.int32); child_6 = None + child_7: "i32[][]cuda:0" = l_q_.new_empty([], dtype = torch.int32); child_7 = None + child_8: "i32[][]cuda:0" = l_q_.new_empty([], dtype = torch.int32); child_8 = None + mask_fn_0 = self.mask_fn_0 + flex_attention = torch.ops.higher_order.flex_attention(l_q_, l_k_, l_v_, score_mod_0, (l_block_mask_kv_num_blocks, l_block_mask_kv_indices, l_block_mask_full_kv_num_blocks, l_block_mask_full_kv_indices, l_block_mask_q_num_blocks, l_block_mask_q_indices, l_block_mask_full_q_num_blocks, l_block_mask_full_q_indices, 128, 128, mask_fn_0), 0.125, {'ROWS_GUARANTEED_SAFE': False, 'PRESCALE_QK': False, 'OUTPUT_LOGSUMEXP': False}, (), ()); l_q_ = l_k_ = l_v_ = score_mod_0 = l_block_mask_kv_num_blocks = l_block_mask_kv_indices = l_block_mask_full_kv_num_blocks = l_block_mask_full_kv_indices = l_block_mask_q_num_blocks = l_block_mask_q_indices = l_block_mask_full_q_num_blocks = l_block_mask_full_q_indices = mask_fn_0 = None + out: "f32[1, 4, 512, 64][131072, 32768, 64, 1]cuda:0" = flex_attention[0]; flex_attention = None + return (out,) + + class score_mod_0(torch.nn.Module): + def forward(self, child: "f32[][]cuda:0", child_1: "i32[][]cuda:0", child_2: "i32[][]cuda:0", child_3: "i32[][]cuda:0", child_4: "i32[][]cuda:0"): + # File: /data/users/oulgen/pytorch/test/inductor/test_codecache.py:377 in score_mod, code: return score + (q - kv) + sub: "i32[][]cuda:0" = child_3 - child_4; child_3 = child_4 = None + add: "f32[][]cuda:0" = child + sub; child = sub = None + return add + + class mask_fn_0(torch.nn.Module): + def forward(self, child_5: "i32[][]cuda:0", child_6: "i32[][]cuda:0", child_7: "i32[][]cuda:0", child_8: "i32[][]cuda:0"): + # File: /data/users/oulgen/pytorch/test/inductor/test_codecache.py:373 in , code: lambda b, h, q, kv: q >= kv, None, None, 2048, 2048 + ge: "b8[][]cuda:0" = child_7 >= child_8; child_7 = child_8 = None + return ge + +V1003 10:11:03.006000 2235078 torch/_dynamo/utils.py:988] {"chromium_event": {}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0, "has_payload": "d0fdf0bbe0d39c70da0305c36b970c1c"} + { + "name": "OutputGraph.call_user_compiler", + "ts": 1727975463006504.2, + "args": null, + "ph": "B", + "cat": "dynamo_timed", + "tid": 0, + "pid": 0 + } +V1003 10:11:03.006000 2235078 torch/_dynamo/utils.py:988] {"chromium_event": {}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0, "has_payload": "2c12bb411b8d14ec0cf2ffdf2c4427c3"} + { + "name": "backend_compile", + "ts": 1727975463006504.2, + "args": null, + "ph": "B", + "cat": "dynamo_timed", + "tid": 0, + "pid": 0 + } +V1003 10:11:03.012000 2235078 torch/_dynamo/utils.py:988] {"chromium_event": {}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0, "has_payload": "17a34747abc18cf69e8e2a0141a3ab09"} + { + "name": "create_aot_dispatcher_function", + "ts": 1727975463012509.8, + "args": null, + "ph": "B", + "cat": "dynamo_timed", + "tid": 0, + "pid": 0 + } +V1003 10:11:03.138000 2235078 torch/_functorch/_aot_autograd/dispatch_and_compile_graph.py:215] {"aot_forward_graph": {}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0, "has_payload": "f6cec159cb1cee52405afcfe40b62c03"} + class (torch.nn.Module): + def forward(self, arg0_1: "f32[1, 4, 512, 64][131072, 32768, 64, 1]cuda:0", arg1_1: "f32[1, 4, 512, 64][131072, 32768, 64, 1]cuda:0", arg2_1: "f32[1, 4, 512, 64][131072, 32768, 64, 1]cuda:0", arg3_1: "i32[1, 1, 16][16, 16, 1]cuda:0", arg4_1: "i32[1, 1, 16, 16][256, 256, 16, 1]cuda:0", arg5_1: "i32[1, 1, 16][16, 16, 1]cuda:0", arg6_1: "i32[1, 1, 16, 16][256, 256, 16, 1]cuda:0", arg7_1: "i32[1, 1, 16][16, 16, 1]cuda:0", arg8_1: "i32[1, 1, 16, 16][256, 256, 16, 1]cuda:0", arg9_1: "i32[1, 1, 16][16, 16, 1]cuda:0", arg10_1: "i32[1, 1, 16, 16][256, 256, 16, 1]cuda:0"): + # File: /data/users/oulgen/pytorch/torch/nn/attention/flex_attention.py:1032 in flex_attention, code: out, lse = flex_attention_hop( + sdpa_score0 = self.sdpa_score0 + sdpa_mask0 = self.sdpa_mask0 + flex_attention = torch.ops.higher_order.flex_attention(arg0_1, arg1_1, arg2_1, sdpa_score0, (arg3_1, arg4_1, arg5_1, arg6_1, arg7_1, arg8_1, arg9_1, arg10_1, 128, 128, sdpa_mask0), 0.125, {'ROWS_GUARANTEED_SAFE': False, 'PRESCALE_QK': False, 'OUTPUT_LOGSUMEXP': False}, (), ()); arg0_1 = arg1_1 = arg2_1 = sdpa_score0 = arg3_1 = arg4_1 = arg5_1 = arg6_1 = arg7_1 = arg8_1 = arg9_1 = arg10_1 = sdpa_mask0 = None + getitem: "f32[1, 4, 512, 64][131072, 32768, 64, 1]cuda:0" = flex_attention[0]; flex_attention = None + return (getitem,) + + class sdpa_score0(torch.nn.Module): + def forward(self, arg0_1: "f32[][]cuda:0", arg1_1: "i32[][]cuda:0", arg2_1: "i32[][]cuda:0", arg3_1: "i32[][]cuda:0", arg4_1: "i32[][]cuda:0"): + # File: /data/users/oulgen/pytorch/torch/nn/attention/flex_attention.py:1032 in flex_attention, code: out, lse = flex_attention_hop( + sub: "i32[][]cuda:0" = torch.ops.aten.sub.Tensor(arg3_1, arg4_1); arg3_1 = arg4_1 = None + add: "f32[][]cuda:0" = torch.ops.aten.add.Tensor(arg0_1, sub); arg0_1 = sub = None + return add + + class sdpa_mask0(torch.nn.Module): + def forward(self, arg0_1: "i32[][]cuda:0", arg1_1: "i32[][]cuda:0", arg2_1: "i32[][]cuda:0", arg3_1: "i32[][]cuda:0"): + # File: /data/users/oulgen/pytorch/test/inductor/test_codecache.py:373 in , code: lambda b, h, q, kv: q >= kv, None, None, 2048, 2048 + ge: "b8[][]cuda:0" = torch.ops.aten.ge.Tensor(arg2_1, arg3_1); arg2_1 = arg3_1 = None + return ge + +V1003 10:11:03.139000 2235078 torch/_dynamo/utils.py:988] {"chromium_event": {}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0, "has_payload": "96a6ba3562cc8fdc9267e57d6099f6dd"} + { + "name": "compile_fx..fw_compiler_base", + "ts": 1727975463139343.8, + "args": null, + "ph": "B", + "cat": "dynamo_timed", + "tid": 0, + "pid": 0 + } +V1003 10:11:03.141000 2235078 torch/_dynamo/utils.py:988] {"chromium_event": {}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0, "has_payload": "4405e9a49d13d3764e0174322946bd85"} + { + "name": "compile_fx_inner", + "ts": 1727975463141246.5, + "args": null, + "ph": "B", + "cat": "dynamo_timed", + "tid": 0, + "pid": 0 + } +V1003 10:11:03.141000 2235078 torch/_dynamo/utils.py:988] {"chromium_event": {}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0, "has_payload": "98ce9c89d4855d24e864b2a507237295"} + { + "name": "inductor_compile", + "ts": 1727975463141246.5, + "args": null, + "ph": "B", + "cat": "dynamo_timed", + "tid": 0, + "pid": 0 + } +V1003 10:11:03.282000 2235078 torch/_inductor/codecache.py:1138] {"inductor_output_code": {"filename": "/tmp/oulgen/tmp4z1i5ywe/kp/ckpysuucou6gm55terbvpynnfevubjpwkfm3ubzxyauw2bgggpvi.py"}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0, "has_payload": "187381168c614ba1181a424c67fdb77c"} + # AOT ID: ['0_inference'] + from ctypes import c_void_p, c_long, c_int + import torch + import math + import random + import os + import tempfile + from math import inf, nan + from torch._inductor.hooks import run_intermediate_hooks + from torch._inductor.utils import maybe_profile + from torch._inductor.codegen.memory_planning import _align as align + from torch import device, empty_strided + from torch._inductor.async_compile import AsyncCompile + from torch._inductor.select_algorithm import extern_kernels + from torch._inductor.codegen.multi_kernel import MultiKernelCall + import torch._inductor.kernel.flex_attention + from torch._C import _cuda_getCurrentRawStream as get_raw_stream + import triton + import triton.language as tl + from torch._inductor.runtime.triton_heuristics import grid, split_scan_grid, grid_combo_kernels, start_graph, end_graph + + aten = torch.ops.aten + inductor_ops = torch.ops.inductor + _quantized = torch.ops._quantized + assert_size_stride = torch._C._dynamo.guards.assert_size_stride + empty_strided_cpu = torch._C._dynamo.guards._empty_strided_cpu + empty_strided_cuda = torch._C._dynamo.guards._empty_strided_cuda + empty_strided_xpu = torch._C._dynamo.guards._empty_strided_xpu + reinterpret_tensor = torch._C._dynamo.guards._reinterpret_tensor + alloc_from_pool = torch.ops.inductor._alloc_from_pool + async_compile = AsyncCompile() + + + # kernel path: /tmp/oulgen/tmp4z1i5ywe/ke/ckedjitfju7kpaxnpatsaz3gzkdz6znc4u5v2e22w2xzbjdvecy5.py + # Topologically Sorted Source Nodes: [flex_attention], Original ATen: [] + # Source node to ATen node mapping: + # flex_attention => flex_attention + # Graph fragment: + # %flex_attention : [num_users=1] = call_function[target=torch.ops.higher_order.flex_attention](args = (%arg0_1, %arg1_1, %arg2_1, %sdpa_score0, (%arg3_1, %arg4_1, %arg5_1, %arg6_1, %arg7_1, %arg8_1, %arg9_1, %arg10_1, 128, 128, %sdpa_mask0), 0.125, {ROWS_GUARANTEED_SAFE: False, PRESCALE_QK: False, OUTPUT_LOGSUMEXP: False}, (), ()), kwargs = {}) + triton_tem_fused_0 = async_compile.triton('triton_tem_fused_0', ''' + import triton + import triton.language as tl + from triton.compiler.compiler import AttrsDescriptor + + from torch._inductor.runtime import triton_helpers, triton_heuristics + from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math + from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, instance_descriptor, DeviceProperties + + @triton_heuristics.template( + num_stages=3, + num_warps=4, + triton_meta={'signature': {'arg_Q': '*fp32', 'arg_K': '*fp32', 'arg_V': '*fp32', 'arg_LSE': '*fp32', 'arg_KV_NUM_BLKS': '*i32', 'arg_KV_IDX': '*i32', 'arg_FULL_KV_NUM_BLKS': '*i32', 'arg_FULL_KV_IDX': '*i32', 'out_ptr0': '*fp32'}, 'device': DeviceProperties(type='cuda', index=0, cc=80, major=8, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, multi_processor_count=108, warp_size=32), 'constants': {}, 'configs': [AttrsDescriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8), equal_to_1=())]}, + inductor_meta={'kernel_name': 'triton_tem_fused_0', 'backend_hash': 'FB2CA426CF35F271C56C0D69873498391AC248E25890F2B631CA8B52D56952BD', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': False, 'autotune_pointwise': True, 'autotune_remote_cache': False, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False}, + ) + @triton.jit + def triton_tem_fused_0(arg_Q, arg_K, arg_V, arg_LSE, arg_KV_NUM_BLKS, arg_KV_IDX, arg_FULL_KV_NUM_BLKS, arg_FULL_KV_IDX, out_ptr0): + ROWS_GUARANTEED_SAFE : tl.constexpr = False + PRESCALE_QK : tl.constexpr = False + OUTPUT_LOGSUMEXP : tl.constexpr = False + FLOAT32_PRECISION : tl.constexpr = 'ieee' + IS_DIVISIBLE : tl.constexpr = True + SM_SCALE : tl.constexpr = 0.125 + GQA_SHARED_HEADS : tl.constexpr = 1 + HAS_FULL_BLOCKS : tl.constexpr = True + QK_HEAD_DIM : tl.constexpr = 64 + V_HEAD_DIM : tl.constexpr = 64 + BLOCK_M : tl.constexpr = 128 + BLOCK_N : tl.constexpr = 32 + SPARSE_Q_BLOCK_SIZE : tl.constexpr = 128 + SPARSE_KV_BLOCK_SIZE : tl.constexpr = 128 + Q = arg_Q + K = arg_K + V = arg_V + LSE = arg_LSE + KV_NUM_BLKS = arg_KV_NUM_BLKS + KV_IDX = arg_KV_IDX + FULL_KV_NUM_BLKS = arg_FULL_KV_NUM_BLKS + FULL_KV_IDX = arg_FULL_KV_IDX + + # Sub notation for this kernel: + # + # Q: Query, K: Key, V: Value + # M: Number of queries, N: Number of keys/values, D: Model dimension + # QK_HEAD_DIM: The dimension of the query and key embeddings + # V_HEAD_DIM: The dimension of the value embeddings + # z: Batch size, h: Number of heads, m: Number of queries per head, k: Number of keys per head + # GQA_SHARED_HEADS: number of query heads sharing one kv head in GQA setups. + # + # The following FULL_* and PARTIAL_* is defined in the block sparse mask grid, rather than the thread block grid. + # KV_NUM_BLKS: The number of KV blocks (that may or may not require masking) for each query. + # KV_IDX: The indices of KV blocks (that may or may not require masking) for each query. + # FULL_KV_NUM_BLKS: The number of fully unmasked KV blocks (so we don't need masking) for each query. + # FULL_KV_IDX: The indices of fully unmasked KV blocks (so we don't need masking) for each query. + # + # OUTPUT_LOGSUMEXP: We only need to store the logsumexp if we require grad + # + # (Modifiable) Performance tuning options + # BLOCK_M: The thread block size across the seqlen dim of Q. + # BLOCK_N: Iterate over BLOCK_N across the seqlen dim of K/V in each thread block. + + # The below are kernel options that can be applied for certain score_mods, + # or involve a numerics vs. perf tradeoff + # PRESCALE_QK: Whether to pre-scale QK by 1/sqrt(d) and change of base. Has + # about 20% more numerical error, but slightly faster. + # ROWS_GUARANTEED_SAFE: Is it guaranteed that at least one value in each row + # is not masked out? If so, we can skip an extra safety check + + tl.static_assert(SPARSE_Q_BLOCK_SIZE >= BLOCK_M and SPARSE_Q_BLOCK_SIZE % BLOCK_M == 0) + tl.static_assert(SPARSE_KV_BLOCK_SIZE >= BLOCK_N and SPARSE_KV_BLOCK_SIZE % BLOCK_N == 0) + + # Define strides of inputs + stride_qz, stride_qh, stride_qm, stride_qk = 131072, 32768, 64, 1 + stride_kz, stride_kh, stride_kn, stride_kk = 131072, 32768, 64, 1 + stride_vz, stride_vh, stride_vn, stride_vk = 131072, 32768, 64, 1 + + ZQ = 1 + HQ = 4 + Q_LEN = 512 + ZKV = 1 + KV_LEN = 512 + + MATMUL_PRECISION = Q.dtype.element_ty + + q_start = tl.program_id(0) + off_zq = tl.program_id(1) // HQ + off_hq = tl.program_id(1) % HQ + + # We support two cases for batch dimension. a) (ZKV == ZQ) where off_zkv = off_zq. + # b) (ZKV == 1 and ZQ > 1) where KV is broadcasted along the batch dimension and off_zkv=0. + off_zkv = off_zq % ZKV + off_hkv = off_hq // GQA_SHARED_HEADS + off_g = off_hq % GQA_SHARED_HEADS + + q_offset = off_zq * stride_qz + off_hq * stride_qh + k_offset = off_zkv * stride_kz + off_hkv * stride_kh + v_offset = off_zkv * stride_vz + off_hkv * stride_vh + + Q = Q + q_offset + K = K + k_offset + V = V + v_offset + + SPARSE_Z = 1 + SPARSE_HQ = 1 + + sparse_idx_z = off_zq % SPARSE_Z + sparse_idx_hq = off_hq % SPARSE_HQ + + SPARSE_Q_MULTIPLE: tl.constexpr = (SPARSE_Q_BLOCK_SIZE // BLOCK_M) + SPARSE_KV_MULTIPLE: tl.constexpr = (SPARSE_KV_BLOCK_SIZE // BLOCK_N) + + stride_kv_num_blks_h = 16 + stride_kv_idx_h = 256 + stride_kv_idx_m = 16 + + # initialize pointer to m and l + m_i = tl.zeros([BLOCK_M], dtype=tl.float32) - float("inf") + l_i = tl.zeros([BLOCK_M], dtype=tl.float32) + acc = tl.zeros([BLOCK_M, V_HEAD_DIM], dtype=tl.float32) + + offs_m = q_start * BLOCK_M + tl.arange(0, BLOCK_M) + + # KV_IDX and KV_NUM_BLKS are always contiguous. + sparse_hz_offset = sparse_idx_z * SPARSE_HQ + sparse_idx_hq + sparse_kv_num_blks_offset = sparse_hz_offset * stride_kv_num_blks_h + q_start // SPARSE_Q_MULTIPLE + sparse_kv_idx_offset = sparse_hz_offset * stride_kv_idx_h + (q_start // SPARSE_Q_MULTIPLE) * stride_kv_idx_m # noqa: B950 + + Q_block_ptr = tl.make_block_ptr( + base=Q, + shape=(Q_LEN, QK_HEAD_DIM), + strides=(stride_qm, stride_qk), + offsets=(q_start * BLOCK_M, 0), + block_shape=(BLOCK_M, QK_HEAD_DIM), + order=(1, 0) + ) + + # load q: it stays in SRAM throughout the inner loop. + if IS_DIVISIBLE: + q = tl.load(Q_block_ptr) + else: + # boundary check is not free, so we only do it when necessary. + q = tl.load(Q_block_ptr, boundary_check=(0,), padding_option = "zero") + + # ~~~~~~~~~~~~~~ normal blocks ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + # We don't know anything "special" about these blocks, so we need to apply + # both score_mod and mask_mod to it + kv_indices = KV_IDX + sparse_kv_idx_offset + kv_start = tl.load(kv_indices) * SPARSE_KV_BLOCK_SIZE # first kv block we're loading + kv_num_blocks = tl.load(KV_NUM_BLKS + sparse_kv_num_blks_offset) + block_n_end = tl.minimum(kv_num_blocks * SPARSE_KV_MULTIPLE, tl.maximum(tl.cdiv(KV_LEN, BLOCK_N), 1)) + + K_block_ptr = tl.make_block_ptr( + base=K, + shape=(QK_HEAD_DIM, KV_LEN), + strides=(stride_kk, stride_kn), + offsets=(0, kv_start), + block_shape=(QK_HEAD_DIM, BLOCK_N), + order=(0, 1) + ) + V_block_ptr = tl.make_block_ptr( + base=V, + shape=(KV_LEN, V_HEAD_DIM), + strides=(stride_vn, stride_vk), + offsets=(kv_start, 0), + block_shape=(BLOCK_N, V_HEAD_DIM), + order=(1, 0) + ) + offs_n = kv_start + tl.arange(0, BLOCK_N) + + acc, l_i, m_i = forward_inner( + arg_Q, arg_K, arg_V, arg_LSE, arg_KV_NUM_BLKS, arg_KV_IDX, arg_FULL_KV_NUM_BLKS, arg_FULL_KV_IDX, out_ptr0, + q, K_block_ptr, V_block_ptr, Q_LEN, KV_LEN, + acc, l_i, m_i, + off_zq, off_hq, offs_m[:, None], offs_n[None, :], + kv_indices, kv_num_blocks, + 0, block_n_end, + MATMUL_PRECISION, + IS_FULL_BLOCKS=False, + ) + + # ~~~~~~~~~~~~~~ "full" blocks ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + # We know these blocks are guaranteed to be "full", so we don't need to + # apply mask_mod to them - only score_mod + if HAS_FULL_BLOCKS: + # FULL_KV_IDX and FULL_KV_NUM_BLKS are always contiguous. + kv_indices = FULL_KV_IDX + sparse_kv_idx_offset + kv_start = tl.load(kv_indices) * SPARSE_KV_BLOCK_SIZE # first kv block we're loading + kv_num_blocks = tl.load(FULL_KV_NUM_BLKS + sparse_kv_num_blks_offset) + block_n_end = tl.minimum(kv_num_blocks * SPARSE_KV_MULTIPLE, tl.maximum(tl.cdiv(KV_LEN, BLOCK_N), 1)) + + K_block_ptr = tl.make_block_ptr( + base=K, + shape=(QK_HEAD_DIM, KV_LEN), + strides=(stride_kk, stride_kn), + offsets=(0, kv_start), + block_shape=(QK_HEAD_DIM, BLOCK_N), + order=(0, 1) + ) + V_block_ptr = tl.make_block_ptr( + base=V, + shape=(KV_LEN, V_HEAD_DIM), + strides=(stride_vn, stride_vk), + offsets=(kv_start, 0), + block_shape=(BLOCK_N, V_HEAD_DIM), + order=(1, 0) + ) + offs_n = kv_start + tl.arange(0, BLOCK_N) + + acc, l_i, m_i = forward_inner( + arg_Q, arg_K, arg_V, arg_LSE, arg_KV_NUM_BLKS, arg_KV_IDX, arg_FULL_KV_NUM_BLKS, arg_FULL_KV_IDX, out_ptr0, + q, K_block_ptr, V_block_ptr, Q_LEN, KV_LEN, + acc, l_i, m_i, + off_zq, off_hq, offs_m[:, None], offs_n[None, :], + kv_indices, kv_num_blocks, + 0, block_n_end, + MATMUL_PRECISION, + IS_FULL_BLOCKS=True, + ) + + + # [Note] Handle fully masked out rows: + # Li will be the sum(e^(-inf)) == 0.0 for masked out rows, mi will be -inf. + # We set Li to 1.0 which will result in lse/out = 0.0 | after the log(li) + mi(0.0) step + l_i = tl.where(l_i == 0.0, 1, l_i) + + acc = acc / l_i[:, None] + idx_zq = tl.program_id(1) // HQ + idx_hq = tl.program_id(1) % HQ + idx_m = offs_m[:, None] + idx_d = tl.arange(0, V_HEAD_DIM)[None, :] + + mask = idx_m < Q_LEN + # TODO generalize and add proper mask support + xindex = idx_d + (64*idx_m) + (32768*idx_hq) + (131072*idx_zq) + tl.store(out_ptr0 + (tl.broadcast_to(idx_d + (64*idx_m) + (32768*idx_hq), acc.shape)), acc, mask) + + # TODO dont want to write this if we dont require grad + if OUTPUT_LOGSUMEXP: + off_hz = tl.program_id(1) + l_ptrs = LSE + off_hz * Q_LEN + offs_m + lse = m_i + tl.math.log2(l_i) + if IS_DIVISIBLE: + tl.store(l_ptrs, lse) + else: + tl.store(l_ptrs, lse, mask=offs_m < Q_LEN) + + @triton.jit + def forward_inner( + arg_Q, arg_K, arg_V, arg_LSE, arg_KV_NUM_BLKS, arg_KV_IDX, arg_FULL_KV_NUM_BLKS, arg_FULL_KV_IDX, out_ptr0, + q, K_block_ptr, V_block_ptr, Q_LEN, KV_LEN, + # accumulated values + acc, l_i, m_i, + # Offsets used as inputs to score_mod & mask_mod + # of size [BLOCK_M, BLOCK_N] or scalar. + off_z, off_h, offs_m, offs_n, + # blocksparse data + kv_indices, kv_num_blocks, + # start kv and end kv block + block_n_start, block_n_end, + MATMUL_PRECISION, + IS_FULL_BLOCKS, + ): + # Redefines all kernel parameters (BLOCK_M, etc.) so we don't need to plumb them all through + ROWS_GUARANTEED_SAFE : tl.constexpr = False + PRESCALE_QK : tl.constexpr = False + OUTPUT_LOGSUMEXP : tl.constexpr = False + FLOAT32_PRECISION : tl.constexpr = 'ieee' + IS_DIVISIBLE : tl.constexpr = True + SM_SCALE : tl.constexpr = 0.125 + GQA_SHARED_HEADS : tl.constexpr = 1 + HAS_FULL_BLOCKS : tl.constexpr = True + QK_HEAD_DIM : tl.constexpr = 64 + V_HEAD_DIM : tl.constexpr = 64 + BLOCK_M : tl.constexpr = 128 + BLOCK_N : tl.constexpr = 32 + SPARSE_Q_BLOCK_SIZE : tl.constexpr = 128 + SPARSE_KV_BLOCK_SIZE : tl.constexpr = 128 + + + SPARSE_KV_MULTIPLE: tl.constexpr = (SPARSE_KV_BLOCK_SIZE // BLOCK_N) + RCP_LN2: tl.constexpr = 1.44269504 + + if PRESCALE_QK: + q = (q * SM_SCALE * RCP_LN2).to(MATMUL_PRECISION) + + # loop over k, v and update accumulator until block_n_end + for start_n in range(block_n_start, block_n_end): + if IS_DIVISIBLE: + acc, l_i, m_i = forward_block_mn( + arg_Q, arg_K, arg_V, arg_LSE, arg_KV_NUM_BLKS, arg_KV_IDX, arg_FULL_KV_NUM_BLKS, arg_FULL_KV_IDX, out_ptr0, + q, K_block_ptr, V_block_ptr, Q_LEN, KV_LEN, + # accumulated values + acc, l_i, m_i, + # Offsets + off_z, off_h, offs_m, offs_n, + MATMUL_PRECISION, RCP_LN2, + IS_FULL_BLOCKS, + ) + else: + # Benchmark shows even we applied mod & mask to each block for non divisible seqlen, + # it's on par or slightly faster than only applying to the last block in fwd. + # However, we choose different strategy for bwd, where we only apply mod & mask + # to the last block because it's faster a lot. + acc, l_i, m_i = forward_block_mn( + arg_Q, arg_K, arg_V, arg_LSE, arg_KV_NUM_BLKS, arg_KV_IDX, arg_FULL_KV_NUM_BLKS, arg_FULL_KV_IDX, out_ptr0, + q, K_block_ptr, V_block_ptr, Q_LEN, KV_LEN, + # accumulated values + acc, l_i, m_i, + # Offsets + off_z, off_h, offs_m, offs_n, + MATMUL_PRECISION, RCP_LN2, + IS_FULL_BLOCKS, CHECK_BLOCK_BOUNDARY=True, + ) + + # update pointers + offset = get_offset_for_next_block( + start_n, kv_indices, kv_num_blocks, + SPARSE_KV_BLOCK_SIZE, SPARSE_KV_MULTIPLE, BLOCK_N + ) + + V_block_ptr = tl.advance(V_block_ptr, (offset, 0)) + K_block_ptr = tl.advance(K_block_ptr, (0, offset)) + + offs_n = offs_n + offset + + return acc, l_i, m_i + + + @triton.jit + def get_offset_for_next_block(loop_iter, col_indices, total_blocks, SPARSE_BLOCK, SPARSE_BLOCK_MULTIPLE, BLOCK): + cur_block_idx = loop_iter // SPARSE_BLOCK_MULTIPLE + cur_block = tl.load(col_indices + cur_block_idx, eviction_policy="evict_last") + next_block = tl.load(col_indices + cur_block_idx + 1, eviction_policy="evict_last", mask=cur_block_idx + 1 < total_blocks) + needs_jump = (loop_iter + 1) % SPARSE_BLOCK_MULTIPLE == 0 + jump_to_block = (next_block - cur_block ) * SPARSE_BLOCK - (SPARSE_BLOCK_MULTIPLE - 1) * BLOCK + + offset = jump_to_block * needs_jump + (1 - needs_jump) * BLOCK + return offset + + @triton.jit + def forward_block_mn( + arg_Q, arg_K, arg_V, arg_LSE, arg_KV_NUM_BLKS, arg_KV_IDX, arg_FULL_KV_NUM_BLKS, arg_FULL_KV_IDX, out_ptr0, + q, K_block_ptr, V_block_ptr, Q_LEN, KV_LEN, + # accumulated values + acc, l_i, m_i, + # Offsets + off_z, off_h, offs_m, offs_n, + MATMUL_PRECISION, RCP_LN2, + IS_FULL_BLOCKS, CHECK_BLOCK_BOUNDARY=False, + ): + # Redefines all kernel parameters (BLOCK_M, etc.) so we don't need to plumb them all through + ROWS_GUARANTEED_SAFE : tl.constexpr = False + PRESCALE_QK : tl.constexpr = False + OUTPUT_LOGSUMEXP : tl.constexpr = False + FLOAT32_PRECISION : tl.constexpr = 'ieee' + IS_DIVISIBLE : tl.constexpr = True + SM_SCALE : tl.constexpr = 0.125 + GQA_SHARED_HEADS : tl.constexpr = 1 + HAS_FULL_BLOCKS : tl.constexpr = True + QK_HEAD_DIM : tl.constexpr = 64 + V_HEAD_DIM : tl.constexpr = 64 + BLOCK_M : tl.constexpr = 128 + BLOCK_N : tl.constexpr = 32 + SPARSE_Q_BLOCK_SIZE : tl.constexpr = 128 + SPARSE_KV_BLOCK_SIZE : tl.constexpr = 128 + + + # -- load k -- + if IS_DIVISIBLE: + k = tl.load(K_block_ptr) + else: + k = tl.load(K_block_ptr, boundary_check=(1,), padding_option = "zero") + # -- compute qk --- + qk = tl.dot(q, k, input_precision=FLOAT32_PRECISION) # TODO: use cuda matmul when q_len <= 2. + if not PRESCALE_QK: + qk *= SM_SCALE + # ~~~~~~~~~~~~~~~~~~~ Apply score modification ~~~~~~~~~~~~~~~~~~~ + if CHECK_BLOCK_BOUNDARY: + # If this is the last block of a non divisible seqlen, we still need to load [BLOCK_M, BLOCK_N] elements, + # which is larger than the actual number of elements. To avoid access memory out of bound, + # we need to mask out the elements that are out of Q_LEN & KV_LEN. + m = offs_m % Q_LEN + n = offs_n % KV_LEN + else: + m = offs_m + n = offs_n + + tmp0 = (m) - (n) + tmp1 = tmp0.to(tl.float32) + tmp2 = (qk) + tmp1 + post_mod_scores = tmp2 + + + if CHECK_BLOCK_BOUNDARY: + # Mask out the elements that are out of the KV_LEN for non divisible seqlen. + post_mod_scores = tl.where(offs_n < KV_LEN, post_mod_scores, float("-inf")) + + if not IS_FULL_BLOCKS: + tmp3 = (m) >= (n) + mask_mod_output = tmp3 + + + if CHECK_BLOCK_BOUNDARY: + mask_mod_output = tl.where(offs_n < KV_LEN, mask_mod_output, float("-inf")) + # apply mask for partially unmasked blocks + post_mod_scores = tl.where(mask_mod_output, post_mod_scores, float("-inf")) + + # TODO: In the case that score_mod is linear, this can be LICMed + if not PRESCALE_QK: + post_mod_scores *= RCP_LN2 + # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + # -- compute scaling constant --- + m_ij = tl.maximum(m_i, tl.max(post_mod_scores, 1)) + if not ROWS_GUARANTEED_SAFE: + masked_out_rows = (m_ij == float("-inf")) + m_ij_masked = tl.where(masked_out_rows, 0, m_ij) + else: + m_ij_masked = m_ij + + alpha = tl.math.exp2(m_i - m_ij_masked) + p = tl.math.exp2(post_mod_scores - m_ij_masked[:, None]) + + # NB: l_i update is pulled up here since it's a bit faster + # NB: For headdim=256, it's faster to move it back down to after m_i = + # m_ij + l_i = l_i * alpha + tl.sum(p, 1) + # # -- scale and update acc -- + acc = acc * alpha[:, None] + + if IS_DIVISIBLE: + v = tl.load(V_block_ptr) + else: + v = tl.load(V_block_ptr, boundary_check=(0,), padding_option = "zero") + acc = tl.dot(p.to(MATMUL_PRECISION), v, acc, input_precision=FLOAT32_PRECISION) + + # -- update m_i + m_i = m_ij + + return acc, l_i, m_i + ''', device_str='cuda') + meta0 = {'ROWS_GUARANTEED_SAFE': False, 'PRESCALE_QK': False, 'OUTPUT_LOGSUMEXP': False, 'FLOAT32_PRECISION': "'ieee'", 'IS_DIVISIBLE': True, 'SM_SCALE': 0.125, 'GQA_SHARED_HEADS': 1, 'HAS_FULL_BLOCKS': True, 'QK_HEAD_DIM': 64, 'V_HEAD_DIM': 64, 'BLOCK_M': 128, 'BLOCK_N': 32, 'SPARSE_Q_BLOCK_SIZE': 128, 'SPARSE_KV_BLOCK_SIZE': 128} + + + async_compile.wait(globals()) + del async_compile + + def call(args): + arg0_1, arg1_1, arg2_1, arg3_1, arg4_1, arg5_1, arg6_1, arg7_1, arg8_1, arg9_1, arg10_1 = args + args.clear() + assert_size_stride(arg0_1, (1, 4, 512, 64), (131072, 32768, 64, 1)) + assert_size_stride(arg1_1, (1, 4, 512, 64), (131072, 32768, 64, 1)) + assert_size_stride(arg2_1, (1, 4, 512, 64), (131072, 32768, 64, 1)) + assert_size_stride(arg3_1, (1, 1, 16), (16, 16, 1)) + assert_size_stride(arg4_1, (1, 1, 16, 16), (256, 256, 16, 1)) + assert_size_stride(arg5_1, (1, 1, 16), (16, 16, 1)) + assert_size_stride(arg6_1, (1, 1, 16, 16), (256, 256, 16, 1)) + assert_size_stride(arg7_1, (1, 1, 16), (16, 16, 1)) + assert_size_stride(arg8_1, (1, 1, 16, 16), (256, 256, 16, 1)) + assert_size_stride(arg9_1, (1, 1, 16), (16, 16, 1)) + assert_size_stride(arg10_1, (1, 1, 16, 16), (256, 256, 16, 1)) + buf0 = empty_strided_cuda((1, 4, 512), (2048, 512, 1), torch.float32) + with torch.cuda._DeviceGuard(0): + torch.cuda.set_device(0) + buf1 = empty_strided_cuda((1, 4, 512, 64), (131072, 32768, 64, 1), torch.float32) + # Topologically Sorted Source Nodes: [flex_attention], Original ATen: [] + stream0 = get_raw_stream(0) + triton_tem_fused_0.run(arg0_1, arg1_1, arg2_1, buf0, arg3_1, arg4_1, arg5_1, arg6_1, buf1, grid=torch._inductor.kernel.flex_attention.flex_attention_grid(1, 4, 512, 64, meta0), stream=stream0) + del arg0_1 + del arg1_1 + del arg2_1 + del arg3_1 + del arg4_1 + del arg5_1 + del arg6_1 + del buf0 + return (buf1, ) + + + def benchmark_compiled_module(times=10, repeat=10): + from torch._dynamo.testing import rand_strided + from torch._inductor.utils import print_performance + arg0_1 = rand_strided((1, 4, 512, 64), (131072, 32768, 64, 1), device='cuda:0', dtype=torch.float32) + arg1_1 = rand_strided((1, 4, 512, 64), (131072, 32768, 64, 1), device='cuda:0', dtype=torch.float32) + arg2_1 = rand_strided((1, 4, 512, 64), (131072, 32768, 64, 1), device='cuda:0', dtype=torch.float32) + arg3_1 = rand_strided((1, 1, 16), (16, 16, 1), device='cuda:0', dtype=torch.int32) + arg4_1 = rand_strided((1, 1, 16, 16), (256, 256, 16, 1), device='cuda:0', dtype=torch.int32) + arg5_1 = rand_strided((1, 1, 16), (16, 16, 1), device='cuda:0', dtype=torch.int32) + arg6_1 = rand_strided((1, 1, 16, 16), (256, 256, 16, 1), device='cuda:0', dtype=torch.int32) + arg7_1 = rand_strided((1, 1, 16), (16, 16, 1), device='cuda:0', dtype=torch.int32) + arg8_1 = rand_strided((1, 1, 16, 16), (256, 256, 16, 1), device='cuda:0', dtype=torch.int32) + arg9_1 = rand_strided((1, 1, 16), (16, 16, 1), device='cuda:0', dtype=torch.int32) + arg10_1 = rand_strided((1, 1, 16, 16), (256, 256, 16, 1), device='cuda:0', dtype=torch.int32) + fn = lambda: call([arg0_1, arg1_1, arg2_1, arg3_1, arg4_1, arg5_1, arg6_1, arg7_1, arg8_1, arg9_1, arg10_1]) + return print_performance(fn, times=times, repeat=repeat) + + + if __name__ == "__main__": + from torch._inductor.wrapper_benchmark import compiled_module_main + compiled_module_main('None', benchmark_compiled_module) + +V1003 10:11:03.283000 2235078 torch/_dynamo/utils.py:1020] {"chromium_event": {}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0, "has_payload": "55abfbf07c349adf63e8871178bf974b"} + { + "name": "fx_graph_cache_hit", + "ts": 1727975463283186.5, + "args": { + "key": "f4lkea5y7lzhlshohvr3aqpd7bchdflfs7j5wn7mrurponawoutk", + "components": [ + "[n7x23yy6fih6vdcjzlzbhy3d6vx3ilu7ylp3zz6wkultu4yvnzn] gm: (\n (sdpa_score0): ()\n (sdpa_mask0): ()\n)\n\n\n\ndef forward(self, arg0_1, arg1_1, arg2_1, arg3_1, arg4_1, arg5_1, arg6_1, arg7_1, arg8_1, arg9_1, arg10_1):\n sdpa_score0 = self.sdpa_score0\n sdpa_mask0 = self.sdpa_mask0\n flex_attention = torch.ops.higher_order.flex_attention(arg0_1, arg1_1, arg2_1, sdpa_score0, (arg3_1, arg4_1, arg5_1, arg6_1, arg7_1, arg8_1, arg9_1, arg10_1, 128, 128, sdpa_mask0), 0.125, {'ROWS_GUARANTEED_SAFE': False, 'PRESCALE_QK': False, 'OUTPUT_LOGSUMEXP': False}, (), ()); arg0_1 = arg1_1 = arg2_1 = sdpa_score0 = arg3_1 = arg4_1 = arg5_1 = arg6_1 = arg7_1 = arg8_1 = arg9_1 = arg10_1 = sdpa_mask0 = None\n getitem = flex_attention[0]; flex_attention = None\n return (getitem,)\n \n# To see more debug info, please use `graph_module.print_readable()`", + "[avf2u3luxvyabchjhbddapcjn5gev47wfdtkrprayuhv6lf2z6u] example_inputs[0]: TensorMetadata(dtype=torch.float32, shape=torch.Size([1, 4, 512, 64]), stride=(131072, 32768, 64, 1), device=device(type='cuda', index=0), layout=torch.strided, memory_format=torch.contiguous_format, storage_offset=0, storage_bytes=None, requires_grad=False, is_quantized=False, is_conj=False, is_neg=False, is_inference=False, is_sparse=False, is_coalesced=None, dense_dim=None, sparse_dim=None)", + "[avf2u3luxvyabchjhbddapcjn5gev47wfdtkrprayuhv6lf2z6u] example_inputs[1]: TensorMetadata(dtype=torch.float32, shape=torch.Size([1, 4, 512, 64]), stride=(131072, 32768, 64, 1), device=device(type='cuda', index=0), layout=torch.strided, memory_format=torch.contiguous_format, storage_offset=0, storage_bytes=None, requires_grad=False, is_quantized=False, is_conj=False, is_neg=False, is_inference=False, is_sparse=False, is_coalesced=None, dense_dim=None, sparse_dim=None)", + "[avf2u3luxvyabchjhbddapcjn5gev47wfdtkrprayuhv6lf2z6u] example_inputs[2]: TensorMetadata(dtype=torch.float32, shape=torch.Size([1, 4, 512, 64]), stride=(131072, 32768, 64, 1), device=device(type='cuda', index=0), layout=torch.strided, memory_format=torch.contiguous_format, storage_offset=0, storage_bytes=None, requires_grad=False, is_quantized=False, is_conj=False, is_neg=False, is_inference=False, is_sparse=False, is_coalesced=None, dense_dim=None, sparse_dim=None)", + "[zsk3gejenkcvvwhiyk36u5zdnlrcs6wgy3pina3csuierfd2zri] example_inputs[3]: TensorMetadata(dtype=torch.int32, shape=torch.Size([1, 1, 16]), stride=(16, 16, 1), device=device(type='cuda', index=0), layout=torch.strided, memory_format=torch.contiguous_format, storage_offset=0, storage_bytes=None, requires_grad=False, is_quantized=False, is_conj=False, is_neg=False, is_inference=False, is_sparse=False, is_coalesced=None, dense_dim=None, sparse_dim=None)", + "[hnbjjzmb63q27mbr22eubaelyb423burv27meouma6ccysmwu6g] example_inputs[4]: TensorMetadata(dtype=torch.int32, shape=torch.Size([1, 1, 16, 16]), stride=(256, 256, 16, 1), device=device(type='cuda', index=0), layout=torch.strided, memory_format=torch.contiguous_format, storage_offset=0, storage_bytes=None, requires_grad=False, is_quantized=False, is_conj=False, is_neg=False, is_inference=False, is_sparse=False, is_coalesced=None, dense_dim=None, sparse_dim=None)", + "[zsk3gejenkcvvwhiyk36u5zdnlrcs6wgy3pina3csuierfd2zri] example_inputs[5]: TensorMetadata(dtype=torch.int32, shape=torch.Size([1, 1, 16]), stride=(16, 16, 1), device=device(type='cuda', index=0), layout=torch.strided, memory_format=torch.contiguous_format, storage_offset=0, storage_bytes=None, requires_grad=False, is_quantized=False, is_conj=False, is_neg=False, is_inference=False, is_sparse=False, is_coalesced=None, dense_dim=None, sparse_dim=None)", + "[hnbjjzmb63q27mbr22eubaelyb423burv27meouma6ccysmwu6g] example_inputs[6]: TensorMetadata(dtype=torch.int32, shape=torch.Size([1, 1, 16, 16]), stride=(256, 256, 16, 1), device=device(type='cuda', index=0), layout=torch.strided, memory_format=torch.contiguous_format, storage_offset=0, storage_bytes=None, requires_grad=False, is_quantized=False, is_conj=False, is_neg=False, is_inference=False, is_sparse=False, is_coalesced=None, dense_dim=None, sparse_dim=None)", + "[zsk3gejenkcvvwhiyk36u5zdnlrcs6wgy3pina3csuierfd2zri] example_inputs[7]: TensorMetadata(dtype=torch.int32, shape=torch.Size([1, 1, 16]), stride=(16, 16, 1), device=device(type='cuda', index=0), layout=torch.strided, memory_format=torch.contiguous_format, storage_offset=0, storage_bytes=None, requires_grad=False, is_quantized=False, is_conj=False, is_neg=False, is_inference=False, is_sparse=False, is_coalesced=None, dense_dim=None, sparse_dim=None)", + "[hnbjjzmb63q27mbr22eubaelyb423burv27meouma6ccysmwu6g] example_inputs[8]: TensorMetadata(dtype=torch.int32, shape=torch.Size([1, 1, 16, 16]), stride=(256, 256, 16, 1), device=device(type='cuda', index=0), layout=torch.strided, memory_format=torch.contiguous_format, storage_offset=0, storage_bytes=None, requires_grad=False, is_quantized=False, is_conj=False, is_neg=False, is_inference=False, is_sparse=False, is_coalesced=None, dense_dim=None, sparse_dim=None)", + "[zsk3gejenkcvvwhiyk36u5zdnlrcs6wgy3pina3csuierfd2zri] example_inputs[9]: TensorMetadata(dtype=torch.int32, shape=torch.Size([1, 1, 16]), stride=(16, 16, 1), device=device(type='cuda', index=0), layout=torch.strided, memory_format=torch.contiguous_format, storage_offset=0, storage_bytes=None, requires_grad=False, is_quantized=False, is_conj=False, is_neg=False, is_inference=False, is_sparse=False, is_coalesced=None, dense_dim=None, sparse_dim=None)", + "[hnbjjzmb63q27mbr22eubaelyb423burv27meouma6ccysmwu6g] example_inputs[10]: TensorMetadata(dtype=torch.int32, shape=torch.Size([1, 1, 16, 16]), stride=(256, 256, 16, 1), device=device(type='cuda', index=0), layout=torch.strided, memory_format=torch.contiguous_format, storage_offset=0, storage_bytes=None, requires_grad=False, is_quantized=False, is_conj=False, is_neg=False, is_inference=False, is_sparse=False, is_coalesced=None, dense_dim=None, sparse_dim=None)", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] fx_kwargs[aot_mode]: False", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] fx_kwargs[cpp_wrapper]: False", + "[xq2hdkbfkbcuye6rgtypayrkhqf4cntij2dsd24rei3lsknakkf] fx_kwargs[cudagraphs]: BoxedBool(value=False)", + "[tquy2we2efmowuj4wuqzcfcfdcrkzkzmwdae6hprj7fa64jpusq] fx_kwargs[extern_node_serializer]: None", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] fx_kwargs[is_backward]: False", + "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] fx_kwargs[is_inference]: True", + "[tquy2we2efmowuj4wuqzcfcfdcrkzkzmwdae6hprj7fa64jpusq] fx_kwargs[layout_opt]: None", + "[h25wqx6vliw4j5rtzzbv6latydxyei3deyg6v7wzvnzryfktuki] fx_kwargs[static_input_idxs]: []", + "[f44ag5aflby2bkxl7a4k6whljrk7jat7bmreuxklei4p3czhk7p] fx_kwargs[user_visible_outputs]: {'getitem': None}", + "[vrl5ktomgtzox5xucd3np6vug3vyj6hwwzahqijuwpmamlv7ohi] inputs_to_check[0]: 0", + "[aghvyrrgwvxijco2pk5wzc3cgmmthrbmgxitiibxuuscxdwrjd3] inputs_to_check[1]: 1", + "[pr5nr4a7dthirgd2ljo3d2xakc63ywxugusu6mkmr6gmpeliyib] inputs_to_check[2]: 2", + "[kcuxe2zwm3mzv2uk6adm6iskoy35bqfv725twacrdewod2dbl5d] inputs_to_check[3]: 3", + "[lkkae3meylaixfif4thncru4hjqeaislawjoghffrbwuscaagei] inputs_to_check[4]: 4", + "[qs5hilycp4ew4ivtc7m5jaxp7q4pm5slioxw3fi3ur6ei65ybz4] inputs_to_check[5]: 5", + "[agkvbkaha53nbz3aeeuhvxjvvc4glhfjofzkg6g2qjoo2e5otcx] inputs_to_check[6]: 6", + "[j3s5elu6itwgjafc7rzhy4whrbufl6kfmlufjhh25grt643bk5f] inputs_to_check[7]: 7", + "[yttmfmxblgcbsvbokguzowcorrcxz5uunxtcvsbe6nijgcx45he] inputs_to_check[8]: 8", + "[qlgfiyqewrmkgqth2qm6wkq2ja5lzkapg3ypgnvoyfqqnidaoj3] inputs_to_check[9]: 9", + "[j6c55jha5r2sdys2rwq7uqhtleea5dgjcye7nicfgft36v7xfvp] inputs_to_check[10]: 10", + "[du4vyrfyozrfxcf6kk6ma7oqwatapifazeelfsawmsiu6gjdtxp] deterministic_algorithms_settings: (False, False, True)", + "[qiptf2633zubseuei4bkisoq3not35l6lud6p23p4qmcsxiw2uq] cuda_matmul_settings: (False, True, True)", + "[7uhqwjfn75ek3woo3k7em2mluon5hx2ojvzlevlvjvz6xfxjhzl] torch_version: ", + "[c3z7bmoxyo6gl5hi47v6dc7jwsl55b3asd75nr25uyengi5ah3p] system_info[device]: {'name': 'NVIDIA PG509-210'}", + "[3fb7kae6ogkdd4zcm3fkjoipdpybxhn4aoxzv7z7xsfwq233e4l] system_info[version]: {'triton': '3.1.0+5fe38ffd73dc767c8fadcf23ea82d79e257c37d44077eae7f681cf967565fd43e9c017937b-835d4fc33500e1accafc5c5e00f4f73d87432c114860c04b68849bf6f942b8e5-dc767c8fadcf23ea82d79e257c37d44077eae7f681cf967565fd43e9c017937b-23d635e690d670bf61798e1259674b78c0ed5ba222ab6a455f329f27a758fc2d-e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855-20b017e9c4d858ab05e783f77df50b86c6d6eee5d79f3f4b158562b4a54f8443-f44338a31e0534290b08653050804c3fabbde403a6d3004ae04f0c28495f0802-e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855-a979896b9c0acfd41dd953b90bdc4b10968f7c0b45a286eae3f829aaddb2bb55-da771298f7bc45d24a61f35ef51742304421df1ab49d50bf1fc510dd5a46ea4b-a8fb7be728d460b7ec64ab62edb8af1bbca8994fd718cde7178d46bad64530a1-71330f394e584b0df29595d49f6ac8ac0c5503db9147090dc58ad888cebac7be-f24adfd52383f7866791ebaa5d45a5d2cc826e56ee2fd285f438e85d201fe643-a34be0d3ae4b3ac9aede195cfda42f8a0a097b2bc9642fb59673ce6b3b607f10-36130a37af1b19a0dec569aa08d30b00c74c8f02b6b632999d86dea169146792-36d42f0429aae027cb985b53b9abc616fae4dad9e0ea03953e1e9fb46d0fb9a0-e5d2cb724c08d0ef4130f3ba858d22cf21f834bfd970a5388aa6ad2a6bab91f9', 'cuda': '12.0'}", + "[z5x2bdhir5lzlbti73vdbfulnuu5vinzpwgmmgf4rjb775tzl3h] system_info[hash]: 9698c97edde4a99a2f3b54bbd0db5291bbcdb75c83acb376ccff61fb0bf0ac1a", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[TYPE_CHECKING]: False", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[abi_compatible]: False", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[aggressive_fusion]: False", + "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[allow_buffer_reuse]: True", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[allow_stack_allocation]: False", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[always_keep_tensor_constants]: False", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[aot_inductor.debug_compile]: False", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[aot_inductor.debug_dump_consts_bin]: False", + "[ngkkx5e6z7erl6da23zb2cmsctz4yvaqyameyg5hbqln4wrhh7x] inductor_config[aot_inductor.debug_intermediate_value_printer]: 0", + "[tquy2we2efmowuj4wuqzcfcfdcrkzkzmwdae6hprj7fa64jpusq] inductor_config[aot_inductor.filtered_kernel_names]: None", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[aot_inductor.force_mmap_weights]: False", + "[4bryyl4ahh5whyg3zwqebpwmjnx6w77nqgqbdjlowju6lkqtn7w] inductor_config[aot_inductor.metadata]: {}", + "[v3hzzlv4tjgvp3pyhmzagjd25orl6n7nynoa7svlhhwk73b7u3c] inductor_config[aot_inductor.output_path]: ", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[aot_inductor.package]: False", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[aot_inductor.package_cpp_only]: False", + "[v3hzzlv4tjgvp3pyhmzagjd25orl6n7nynoa7svlhhwk73b7u3c] inductor_config[aot_inductor.serialized_in_spec]: ", + "[v3hzzlv4tjgvp3pyhmzagjd25orl6n7nynoa7svlhhwk73b7u3c] inductor_config[aot_inductor.serialized_out_spec]: ", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[aot_inductor.use_runtime_constant_folding]: False", + "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[assert_indirect_indexing]: True", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[assume_aligned_inputs]: False", + "[v3hzzlv4tjgvp3pyhmzagjd25orl6n7nynoa7svlhhwk73b7u3c] inductor_config[autoheuristic_collect]: ", + "[jvchmi66fvqzlemhr5fcqorz5trfdtdalzfagtj2aolmimwqhdq] inductor_config[autoheuristic_log_path]: DEFAULT", + "[jwbrgxes7vjqumngs5hyj6gn5nytv2whnppnzngvaagfmawhkkd] inductor_config[autoheuristic_use]: mixed_mm", + "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[autotune_fallback_to_aten]: True", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[autotune_in_subproc]: False", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[autotune_local_cache]: False", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[autotune_multi_device]: False", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[autotune_remote_cache]: False", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[b2b_gemm_pass]: False", + "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[batch_fusion]: True", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[benchmark_combo_kernel]: False", + "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[benchmark_epilogue_fusion]: True", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[benchmark_fusion]: False", + "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[benchmark_harness]: True", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[benchmark_kernel]: False", + "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[bw_outputs_user_visible]: True", + "[b4ha3ravs3qv237q65hpfqegbnoww7tf2ahcbu2i7xo6te5spqs] inductor_config[c_shim_version]: 2", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[check_stack_no_cycles_TESTING_ONLY]: False", + "[aghvyrrgwvxijco2pk5wzc3cgmmthrbmgxitiibxuuscxdwrjd3] inductor_config[combo_kernel_allow_mixed_sizes]: 1", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[combo_kernel_foreach_dynamic_shapes]: False", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[combo_kernels]: False", + "[aghvyrrgwvxijco2pk5wzc3cgmmthrbmgxitiibxuuscxdwrjd3] inductor_config[combo_kernels_autotune]: 1", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[comment_origin]: False", + "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[comprehensive_padding]: True", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[compute_all_bounds]: False", + "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[constant_and_index_propagation]: True", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[conv_1x1_as_mm]: False", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[coordinate_descent_check_all_directions]: False", + "[aghvyrrgwvxijco2pk5wzc3cgmmthrbmgxitiibxuuscxdwrjd3] inductor_config[coordinate_descent_search_radius]: 1", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[coordinate_descent_tuning]: False", + "[c7zj4qytmety6keurs3hsh5wn7foxp3dqx4kym2ucszzcb2ngrf] inductor_config[cpp.cxx]: (None, 'g++')", + "[yrty22bseefglnysuoec4ji7j2rnaggdj3g33zzj7avogwfmgdw] inductor_config[cpp.descriptive_names]: original_aten", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[cpp.dynamic_threads]: False", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[cpp.enable_floating_point_contract_flag]: False", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[cpp.enable_kernel_profile]: False", + "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[cpp.enable_loop_tail_vec]: True", + "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[cpp.enable_tiling_heuristics]: True", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[cpp.enable_unsafe_math_opt_flag]: False", + "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[cpp.fallback_scatter_reduce_sum]: True", + "[tquy2we2efmowuj4wuqzcfcfdcrkzkzmwdae6hprj7fa64jpusq] inductor_config[cpp.gemm_cache_blocking]: None", + "[aghvyrrgwvxijco2pk5wzc3cgmmthrbmgxitiibxuuscxdwrjd3] inductor_config[cpp.gemm_max_k_slices]: 1", + "[tquy2we2efmowuj4wuqzcfcfdcrkzkzmwdae6hprj7fa64jpusq] inductor_config[cpp.gemm_thread_factors]: None", + "[tquy2we2efmowuj4wuqzcfcfdcrkzkzmwdae6hprj7fa64jpusq] inductor_config[cpp.inject_log1p_bug_TESTING_ONLY]: None", + "[tquy2we2efmowuj4wuqzcfcfdcrkzkzmwdae6hprj7fa64jpusq] inductor_config[cpp.inject_relu_bug_TESTING_ONLY]: None", + "[ebt2ncs4f5y7dn7btzi76mnouepvzad474tmp5iju4wiuumjl4s] inductor_config[cpp.max_horizontal_fusion_size]: 16", + "[g7rrnbg5yonzux3cfj5ovre5lob3ayda7qcfpxjvtwmiz4uicii] inductor_config[cpp.min_chunk_size]: 4096", + "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[cpp.no_redundant_loops]: True", + "[tquy2we2efmowuj4wuqzcfcfdcrkzkzmwdae6hprj7fa64jpusq] inductor_config[cpp.simdlen]: None", + "[sz3im5ogc6asp7g4uqocnovype63tkdexzfrniv6hn2oank3biu] inductor_config[cpp.threads]: -1", + "[tquy2we2efmowuj4wuqzcfcfdcrkzkzmwdae6hprj7fa64jpusq] inductor_config[cpp.vec_isa_ok]: None", + "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[cpp.weight_prepack]: True", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[cpp_wrapper]: False", + "[bsvfcwwoczx2rlkdz2eta6doujsymyihmi46hhwk6clrrvwcb6m] inductor_config[cpu_backend]: cpp", + "[tquy2we2efmowuj4wuqzcfcfdcrkzkzmwdae6hprj7fa64jpusq] inductor_config[cuda.arch]: None", + "[tvyftmtdmezlejo2xllu7awzv4pzc4vm4fub4b3gpl5jptjkosi] inductor_config[cuda.compile_opt_level]: -O1", + "[tquy2we2efmowuj4wuqzcfcfdcrkzkzmwdae6hprj7fa64jpusq] inductor_config[cuda.cuda_cxx]: None", + "[aghvyrrgwvxijco2pk5wzc3cgmmthrbmgxitiibxuuscxdwrjd3] inductor_config[cuda.cutlass_backend_min_gemm_size]: 1", + "[tquy2we2efmowuj4wuqzcfcfdcrkzkzmwdae6hprj7fa64jpusq] inductor_config[cuda.cutlass_max_profiling_configs]: None", + "[tquy2we2efmowuj4wuqzcfcfdcrkzkzmwdae6hprj7fa64jpusq] inductor_config[cuda.cutlass_op_allowlist_regex]: None", + "[lwkz5chtpji756gurqw4foijfi7zfgljtnn5nmnvdi2skpt4mgh] inductor_config[cuda.cutlass_op_denylist_regex]: pingpong", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[cuda.enable_cuda_lto]: False", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[cuda.enable_debug_info]: False", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[cuda.enable_ptxas_info]: False", + "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[cuda.generate_test_runner]: True", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[cuda.use_fast_math]: False", + "[tquy2we2efmowuj4wuqzcfcfdcrkzkzmwdae6hprj7fa64jpusq] inductor_config[cuda.version]: None", + "[caw4ly2z672k6kjfahoxwpajp5idhhtrpgf3ma2clylcp7c7aid] inductor_config[cuda_backend]: triton", + "[pikr7bbcoixfzftsazp5ggufhdklj24babfry77bl4nuvyrrcp4] inductor_config[custom_op_default_layout_constraint]: needs_fixed_stride_order", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[dce]: False", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[debug]: False", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[debug_fusion]: False", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[debug_index_asserts]: False", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[debug_ir_traceback]: False", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[decompose_mem_bound_mm]: False", + "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[developer_warnings]: True", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[disable_cpp_codegen]: False", + "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[disable_padding_cpu]: True", + "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[disable_progress]: True", + "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[dynamic_scale_rblock]: True", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[efficient_conv_bn_eval_fx_passes]: False", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[emulate_precision_casts]: False", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[enable_auto_functionalized_v2]: False", + "[v3hzzlv4tjgvp3pyhmzagjd25orl6n7nynoa7svlhhwk73b7u3c] inductor_config[enabled_metric_tables]: ", + "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[epilogue_fusion]: True", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[epilogue_fusion_first]: False", + "[lxxtoqhcoepwfokeiibd575gnxo3uzwiv4hmpomlwkpzqz3qzsh] inductor_config[estimate_op_runtime]: default", + "[h25wqx6vliw4j5rtzzbv6latydxyei3deyg6v7wzvnzryfktuki] inductor_config[external_matmul]: []", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[fallback_random]: False", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[force_disable_caches]: False", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[force_fuse_int_mm_with_mul]: False", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[force_layout_optimization]: False", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[force_same_precision]: False", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[force_shape_pad]: False", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[freezing]: False", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[freezing_discard_parameters]: False", + "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[fx_graph_cache]: True", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[fx_graph_remote_cache]: False", + "[62lrdx35b7hnumwb7mp5oc5y5csm2abylvtdzfloct3noaqov3n] inductor_config[fx_passes_numeric_check]: {'pre_grad': False, 'post_grad': False, 'precision': 0.0001, 'num_iterations': 1, 'requires_optimizer': True}", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[generate_intermediate_hooks]: False", + "[tquy2we2efmowuj4wuqzcfcfdcrkzkzmwdae6hprj7fa64jpusq] inductor_config[global_cache_dir]: None", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[group_fusion]: False", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[halide.asserts]: False", + "[ljhgflgihidopsfsdcbqynv27nceykby3nutyd5jlcpq7n6e7l4] inductor_config[halide.cpu_target]: host", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[halide.debug]: False", + "[wx7vmsmrdpk5ue2txlywp3lj3faqmdjphs5fgg2ehzsyno7uovg] inductor_config[halide.gpu_target]: host-cuda", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[halide.scan_kernels]: False", + "[k5ogk6345jvklsnu7g2njqstiz2g6pm5wmqpgg3kasrmuqwjvl6] inductor_config[halide.scheduler_cpu]: Adams2019", + "[svgytlua5wcyeia7wq7e6zgh5tsueikrnzchmdmouvmkpfsc2zq] inductor_config[halide.scheduler_cuda]: Anderson2021", + "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[implicit_fallbacks]: True", + "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[inplace_buffers]: True", + "[5fxczt3ciyxitdhizb7sfsgn7fhpczcqsngttnt5ot2wyctk7co] inductor_config[inter_node_bw]: 25", + "[yezuzjtg4h3jjur4jwtwiehbyixa7eonq4tqsqmwqve2lvvmrem] inductor_config[intra_node_bw]: 300", + "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[is_nightly_or_source]: True", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[is_predispatch]: False", + "[tquy2we2efmowuj4wuqzcfcfdcrkzkzmwdae6hprj7fa64jpusq] inductor_config[joint_custom_post_pass]: None", + "[tquy2we2efmowuj4wuqzcfcfdcrkzkzmwdae6hprj7fa64jpusq] inductor_config[joint_custom_pre_pass]: None", + "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[joint_graph_constant_folding]: True", + "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[keep_output_stride]: True", + "[j6c55jha5r2sdys2rwq7uqhtleea5dgjcye7nicfgft36v7xfvp] inductor_config[kernel_name_max_ops]: 10", + "[4p2fdjlvxrcw7c7fvzm5huhtqxnro4kvkx56f7p5zyrxqkwooov] inductor_config[layout_opt_default]: 1", + "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[layout_optimization]: True", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[loop_ordering_after_fusion]: False", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[max_autotune]: False", + "[uqlsbif4zxd75vt522p52txyuguieipi2lwz5g5awt56lccqk7s] inductor_config[max_autotune_conv_backends]: ATEN,TRITON", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[max_autotune_gemm]: False", + "[2y7luesktjrque3nr7qtxnum2mkbeegzdrsvkm3rvdlhqboajhx] inductor_config[max_autotune_gemm_backends]: ATEN,TRITON,CPP", + "[jvchmi66fvqzlemhr5fcqorz5trfdtdalzfagtj2aolmimwqhdq] inductor_config[max_autotune_gemm_search_space]: DEFAULT", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[max_autotune_pointwise]: False", + "[bh33ranllcgilhgmgr3qvygzxjm6isq5iexnfm3zx6fnr2zwlp2] inductor_config[max_autotune_subproc_graceful_timeout_seconds]: 1.0", + "[iglov24t7x5ruci344aer2tm6nqshi4veuw4wxlssxtu46cx76m] inductor_config[max_autotune_subproc_result_timeout_seconds]: 60.0", + "[pwoh5aypf4fxbntdvwt67rppxorqos6xr3w7qzeun6kblbfg2ga] inductor_config[max_autotune_subproc_terminate_timeout_seconds]: 2.0", + "[aghvyrrgwvxijco2pk5wzc3cgmmthrbmgxitiibxuuscxdwrjd3] inductor_config[max_epilogue_benchmarked_choices]: 1", + "[jykiys6ynafs3zdylwa5ggq6j655mxeh42d6mtdi22gffkrmiac] inductor_config[max_fusion_size]: 64", + "[yttmfmxblgcbsvbokguzowcorrcxz5uunxtcvsbe6nijgcx45he] inductor_config[max_pointwise_cat_inputs]: 8", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[memory_planning]: False", + "[x75won4jmsgeb63pcvwr2y4eteyzzdhmf5rv6xhjppie4hx2yu5] inductor_config[memory_pool]: intermediates", + "[v2td5s4lnsvyxvaevy4chx6kc5h3mm2axazbgwimqule5zrzao7] inductor_config[mixed_mm_choice]: heuristic", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[nan_asserts]: False", + "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[optimize_scatter_upon_const_tensor]: True", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[pad_channels_last]: False", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[pad_outputs]: False", + "[ljdqgtysl3vdf7j6attlz5gmjg2ncihnveojfyubosplmkrjgra] inductor_config[padding_alignment_bytes]: 128", + "[dnnw5ks3yxrp7mwvihb2hh4tqx35ye637xt33x64kw4fvz2nyzg] inductor_config[padding_stride_threshold]: 1024", + "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[pattern_matcher]: True", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[permute_fusion]: False", + "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[pick_loop_orders]: True", + "[tquy2we2efmowuj4wuqzcfcfdcrkzkzmwdae6hprj7fa64jpusq] inductor_config[post_grad_custom_post_pass]: None", + "[tquy2we2efmowuj4wuqzcfcfdcrkzkzmwdae6hprj7fa64jpusq] inductor_config[post_grad_custom_pre_pass]: None", + "[4bryyl4ahh5whyg3zwqebpwmjnx6w77nqgqbdjlowju6lkqtn7w] inductor_config[post_grad_fusion_options]: {}", + "[tquy2we2efmowuj4wuqzcfcfdcrkzkzmwdae6hprj7fa64jpusq] inductor_config[pre_grad_custom_pass]: None", + "[4bryyl4ahh5whyg3zwqebpwmjnx6w77nqgqbdjlowju6lkqtn7w] inductor_config[pre_grad_fusion_options]: {}", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[profile_bandwidth]: False", + "[tquy2we2efmowuj4wuqzcfcfdcrkzkzmwdae6hprj7fa64jpusq] inductor_config[profile_bandwidth_output]: None", + "[v3hzzlv4tjgvp3pyhmzagjd25orl6n7nynoa7svlhhwk73b7u3c] inductor_config[profile_bandwidth_regex]: ", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[profile_bandwidth_with_do_bench_using_profiling]: False", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[profiler_mark_wrapper_call]: False", + "[yttmfmxblgcbsvbokguzowcorrcxz5uunxtcvsbe6nijgcx45he] inductor_config[realize_acc_reads_threshold]: 8", + "[rr5m5hsocoyodldz7vcvaizdwvm2rt34evmqdxvng7wz3tufvo6] inductor_config[realize_opcount_threshold]: 30", + "[lkkae3meylaixfif4thncru4hjqeaislawjoghffrbwuscaagei] inductor_config[realize_reads_threshold]: 4", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[reorder_for_compute_comm_overlap]: False", + "[ssupi7bu3rrhdpg2jyegzncu3kg3nnhklyliqvutaxgs7y7k3dx] inductor_config[reorder_for_compute_comm_overlap_passes]: ['reorder_compute_for_overlap', 'sink_waits', 'raise_comms']", + "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[reorder_for_locality]: True", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[reorder_for_peak_memory]: False", + "[h25wqx6vliw4j5rtzzbv6latydxyei3deyg6v7wzvnzryfktuki] inductor_config[rocm.arch]: []", + "[tquy2we2efmowuj4wuqzcfcfdcrkzkzmwdae6hprj7fa64jpusq] inductor_config[rocm.ck_dir]: None", + "[oartxnko2l7d67tzwwm2otcumaut3n4wwcfgz3o377hmcveu5ft] inductor_config[rocm.ck_supported_arch]: ['gfx90a', 'gfx940', 'gfx941', 'gfx942']", + "[klfqjprnpfhcdurgvuikvc4rpd5ynkpk77toousr5h3u5roty6p] inductor_config[rocm.compile_opt_level]: -O2", + "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[rocm.flush_denormals]: True", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[rocm.is_debug]: False", + "[tquy2we2efmowuj4wuqzcfcfdcrkzkzmwdae6hprj7fa64jpusq] inductor_config[rocm.n_max_profiling_configs]: None", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[rocm.print_kernel_resource_usage]: False", + "[tquy2we2efmowuj4wuqzcfcfdcrkzkzmwdae6hprj7fa64jpusq] inductor_config[rocm.rocm_home]: None", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[rocm.save_temps]: False", + "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[rocm.use_fast_math]: True", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[rocm.use_preselected_instances]: False", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[save_args]: False", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[search_autotune_cache]: False", + "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[shape_padding]: True", + "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[size_asserts]: True", + "[tquy2we2efmowuj4wuqzcfcfdcrkzkzmwdae6hprj7fa64jpusq] inductor_config[sleep_sec_TESTING_ONLY]: None", + "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[split_cat_fx_passes]: True", + "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[split_reductions]: True", + "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[static_weight_shapes]: True", + "[tquy2we2efmowuj4wuqzcfcfdcrkzkzmwdae6hprj7fa64jpusq] inductor_config[triton.autotune_at_compile_time]: None", + "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[triton.autotune_cublasLt]: True", + "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[triton.autotune_pointwise]: True", + "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[triton.codegen_upcast_to_fp32]: True", + "[tuax46wac7rfv2trf5gcps6vleo3cq44lbnrdxtprvo3ljjaddj] inductor_config[triton.cudagraph_dynamic_shape_warn_limit]: 50", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[triton.cudagraph_skip_dynamic_graphs]: False", + "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[triton.cudagraph_support_input_mutation]: True", + "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[triton.cudagraph_trees]: True", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[triton.cudagraph_trees_history_recording]: False", + "[ljdqgtysl3vdf7j6attlz5gmjg2ncihnveojfyubosplmkrjgra] inductor_config[triton.cudagraph_unexpected_rerecord_limit]: 128", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[triton.cudagraphs]: False", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[triton.debug_sync_graph]: False", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[triton.debug_sync_kernel]: False", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[triton.dense_indexing]: False", + "[yrty22bseefglnysuoec4ji7j2rnaggdj3g33zzj7avogwfmgdw] inductor_config[triton.descriptive_names]: original_aten", + "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[triton.divisible_by_16]: True", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[triton.fast_path_cudagraph_asserts]: False", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[triton.force_cudagraph_sync]: False", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[triton.force_cudagraphs_warmup]: False", + "[tquy2we2efmowuj4wuqzcfcfdcrkzkzmwdae6hprj7fa64jpusq] inductor_config[triton.inject_relu_bug_TESTING_ONLY]: None", + "[pr5nr4a7dthirgd2ljo3d2xakc63ywxugusu6mkmr6gmpeliyib] inductor_config[triton.max_tiles]: 2", + "[fv6slhtedtydps5s5u2etitscliblzcidyitqf7krsv4e23fzk6] inductor_config[triton.min_split_scan_rblock]: 256", + "[vrl5ktomgtzox5xucd3np6vug3vyj6hwwzahqijuwpmamlv7ohi] inductor_config[triton.multi_kernel]: 0", + "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[triton.persistent_reductions]: True", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[triton.prefer_nd_tiling]: False", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[triton.skip_cudagraph_warmup]: False", + "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[triton.slow_path_cudagraph_asserts]: True", + "[ebt2ncs4f5y7dn7btzi76mnouepvzad474tmp5iju4wiuumjl4s] inductor_config[triton.spill_threshold]: 16", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[triton.store_cubin]: False", + "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[triton.tiling_prevents_pointwise_fusion]: True", + "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[triton.tiling_prevents_reduction_fusion]: True", + "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[triton.unique_kernel_names]: True", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[triton.use_block_ptr]: False", + "[vzzema5ityqj2wepdmkulue7q5pcevdr5h27oxxutf35d4tjume] inductor_config[triton_kernel_default_layout_constraint]: flexible_layout", + "[wft6ljqsfr3x4m7fa5zuyb7cwknky4irrxz4bjr6uzr2yiopxqj] inductor_config[unbacked_symint_fallback]: 8192", + "[yttmfmxblgcbsvbokguzowcorrcxz5uunxtcvsbe6nijgcx45he] inductor_config[unroll_reductions_threshold]: 8", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[unsafe_ignore_unsupported_triton_autotune_args]: False", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[use_minimal_arrayref_interface]: False", + "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[use_mixed_mm]: True", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[verbose_progress]: False", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[warn_mix_layout]: False" + ], + "cache_event_time": 1727975463283186408, + "cache_state": "hit", + "time_saved_ns": 6372999079 + }, + "ph": "i", + "cat": "dynamo_timed", + "tid": 0, + "pid": 0, + "s": "p" + } +V1003 10:11:03.284000 2235078 torch/_inductor/codecache.py:1463] {"artifact": {"name": "fx_graph_cache_hit", "encoding": "json"}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0, "has_payload": "b87f8df3afd49918e4211191e3fc10a5"} + {"key": "f4lkea5y7lzhlshohvr3aqpd7bchdflfs7j5wn7mrurponawoutk", "components": ["[n7x23yy6fih6vdcjzlzbhy3d6vx3ilu7ylp3zz6wkultu4yvnzn] gm: (\n (sdpa_score0): ()\n (sdpa_mask0): ()\n)\n\n\n\ndef forward(self, arg0_1, arg1_1, arg2_1, arg3_1, arg4_1, arg5_1, arg6_1, arg7_1, arg8_1, arg9_1, arg10_1):\n sdpa_score0 = self.sdpa_score0\n sdpa_mask0 = self.sdpa_mask0\n flex_attention = torch.ops.higher_order.flex_attention(arg0_1, arg1_1, arg2_1, sdpa_score0, (arg3_1, arg4_1, arg5_1, arg6_1, arg7_1, arg8_1, arg9_1, arg10_1, 128, 128, sdpa_mask0), 0.125, {'ROWS_GUARANTEED_SAFE': False, 'PRESCALE_QK': False, 'OUTPUT_LOGSUMEXP': False}, (), ()); arg0_1 = arg1_1 = arg2_1 = sdpa_score0 = arg3_1 = arg4_1 = arg5_1 = arg6_1 = arg7_1 = arg8_1 = arg9_1 = arg10_1 = sdpa_mask0 = None\n getitem = flex_attention[0]; flex_attention = None\n return (getitem,)\n \n# To see more debug info, please use `graph_module.print_readable()`", "[avf2u3luxvyabchjhbddapcjn5gev47wfdtkrprayuhv6lf2z6u] example_inputs[0]: TensorMetadata(dtype=torch.float32, shape=torch.Size([1, 4, 512, 64]), stride=(131072, 32768, 64, 1), device=device(type='cuda', index=0), layout=torch.strided, memory_format=torch.contiguous_format, storage_offset=0, storage_bytes=None, requires_grad=False, is_quantized=False, is_conj=False, is_neg=False, is_inference=False, is_sparse=False, is_coalesced=None, dense_dim=None, sparse_dim=None)", "[avf2u3luxvyabchjhbddapcjn5gev47wfdtkrprayuhv6lf2z6u] example_inputs[1]: TensorMetadata(dtype=torch.float32, shape=torch.Size([1, 4, 512, 64]), stride=(131072, 32768, 64, 1), device=device(type='cuda', index=0), layout=torch.strided, memory_format=torch.contiguous_format, storage_offset=0, storage_bytes=None, requires_grad=False, is_quantized=False, is_conj=False, is_neg=False, is_inference=False, is_sparse=False, is_coalesced=None, dense_dim=None, sparse_dim=None)", "[avf2u3luxvyabchjhbddapcjn5gev47wfdtkrprayuhv6lf2z6u] example_inputs[2]: TensorMetadata(dtype=torch.float32, shape=torch.Size([1, 4, 512, 64]), stride=(131072, 32768, 64, 1), device=device(type='cuda', index=0), layout=torch.strided, memory_format=torch.contiguous_format, storage_offset=0, storage_bytes=None, requires_grad=False, is_quantized=False, is_conj=False, is_neg=False, is_inference=False, is_sparse=False, is_coalesced=None, dense_dim=None, sparse_dim=None)", "[zsk3gejenkcvvwhiyk36u5zdnlrcs6wgy3pina3csuierfd2zri] example_inputs[3]: TensorMetadata(dtype=torch.int32, shape=torch.Size([1, 1, 16]), stride=(16, 16, 1), device=device(type='cuda', index=0), layout=torch.strided, memory_format=torch.contiguous_format, storage_offset=0, storage_bytes=None, requires_grad=False, is_quantized=False, is_conj=False, is_neg=False, is_inference=False, is_sparse=False, is_coalesced=None, dense_dim=None, sparse_dim=None)", "[hnbjjzmb63q27mbr22eubaelyb423burv27meouma6ccysmwu6g] example_inputs[4]: TensorMetadata(dtype=torch.int32, shape=torch.Size([1, 1, 16, 16]), stride=(256, 256, 16, 1), device=device(type='cuda', index=0), layout=torch.strided, memory_format=torch.contiguous_format, storage_offset=0, storage_bytes=None, requires_grad=False, is_quantized=False, is_conj=False, is_neg=False, is_inference=False, is_sparse=False, is_coalesced=None, dense_dim=None, sparse_dim=None)", "[zsk3gejenkcvvwhiyk36u5zdnlrcs6wgy3pina3csuierfd2zri] example_inputs[5]: TensorMetadata(dtype=torch.int32, shape=torch.Size([1, 1, 16]), stride=(16, 16, 1), device=device(type='cuda', index=0), layout=torch.strided, memory_format=torch.contiguous_format, storage_offset=0, storage_bytes=None, requires_grad=False, is_quantized=False, is_conj=False, is_neg=False, is_inference=False, is_sparse=False, is_coalesced=None, dense_dim=None, sparse_dim=None)", "[hnbjjzmb63q27mbr22eubaelyb423burv27meouma6ccysmwu6g] example_inputs[6]: TensorMetadata(dtype=torch.int32, shape=torch.Size([1, 1, 16, 16]), stride=(256, 256, 16, 1), device=device(type='cuda', index=0), layout=torch.strided, memory_format=torch.contiguous_format, storage_offset=0, storage_bytes=None, requires_grad=False, is_quantized=False, is_conj=False, is_neg=False, is_inference=False, is_sparse=False, is_coalesced=None, dense_dim=None, sparse_dim=None)", "[zsk3gejenkcvvwhiyk36u5zdnlrcs6wgy3pina3csuierfd2zri] example_inputs[7]: TensorMetadata(dtype=torch.int32, shape=torch.Size([1, 1, 16]), stride=(16, 16, 1), device=device(type='cuda', index=0), layout=torch.strided, memory_format=torch.contiguous_format, storage_offset=0, storage_bytes=None, requires_grad=False, is_quantized=False, is_conj=False, is_neg=False, is_inference=False, is_sparse=False, is_coalesced=None, dense_dim=None, sparse_dim=None)", "[hnbjjzmb63q27mbr22eubaelyb423burv27meouma6ccysmwu6g] example_inputs[8]: TensorMetadata(dtype=torch.int32, shape=torch.Size([1, 1, 16, 16]), stride=(256, 256, 16, 1), device=device(type='cuda', index=0), layout=torch.strided, memory_format=torch.contiguous_format, storage_offset=0, storage_bytes=None, requires_grad=False, is_quantized=False, is_conj=False, is_neg=False, is_inference=False, is_sparse=False, is_coalesced=None, dense_dim=None, sparse_dim=None)", "[zsk3gejenkcvvwhiyk36u5zdnlrcs6wgy3pina3csuierfd2zri] example_inputs[9]: TensorMetadata(dtype=torch.int32, shape=torch.Size([1, 1, 16]), stride=(16, 16, 1), device=device(type='cuda', index=0), layout=torch.strided, memory_format=torch.contiguous_format, storage_offset=0, storage_bytes=None, requires_grad=False, is_quantized=False, is_conj=False, is_neg=False, is_inference=False, is_sparse=False, is_coalesced=None, dense_dim=None, sparse_dim=None)", "[hnbjjzmb63q27mbr22eubaelyb423burv27meouma6ccysmwu6g] example_inputs[10]: TensorMetadata(dtype=torch.int32, shape=torch.Size([1, 1, 16, 16]), stride=(256, 256, 16, 1), device=device(type='cuda', index=0), layout=torch.strided, memory_format=torch.contiguous_format, storage_offset=0, storage_bytes=None, requires_grad=False, is_quantized=False, is_conj=False, is_neg=False, is_inference=False, is_sparse=False, is_coalesced=None, dense_dim=None, sparse_dim=None)", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] fx_kwargs[aot_mode]: False", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] fx_kwargs[cpp_wrapper]: False", "[xq2hdkbfkbcuye6rgtypayrkhqf4cntij2dsd24rei3lsknakkf] fx_kwargs[cudagraphs]: BoxedBool(value=False)", "[tquy2we2efmowuj4wuqzcfcfdcrkzkzmwdae6hprj7fa64jpusq] fx_kwargs[extern_node_serializer]: None", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] fx_kwargs[is_backward]: False", "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] fx_kwargs[is_inference]: True", "[tquy2we2efmowuj4wuqzcfcfdcrkzkzmwdae6hprj7fa64jpusq] fx_kwargs[layout_opt]: None", "[h25wqx6vliw4j5rtzzbv6latydxyei3deyg6v7wzvnzryfktuki] fx_kwargs[static_input_idxs]: []", "[f44ag5aflby2bkxl7a4k6whljrk7jat7bmreuxklei4p3czhk7p] fx_kwargs[user_visible_outputs]: {'getitem': None}", "[vrl5ktomgtzox5xucd3np6vug3vyj6hwwzahqijuwpmamlv7ohi] inputs_to_check[0]: 0", "[aghvyrrgwvxijco2pk5wzc3cgmmthrbmgxitiibxuuscxdwrjd3] inputs_to_check[1]: 1", "[pr5nr4a7dthirgd2ljo3d2xakc63ywxugusu6mkmr6gmpeliyib] inputs_to_check[2]: 2", "[kcuxe2zwm3mzv2uk6adm6iskoy35bqfv725twacrdewod2dbl5d] inputs_to_check[3]: 3", "[lkkae3meylaixfif4thncru4hjqeaislawjoghffrbwuscaagei] inputs_to_check[4]: 4", "[qs5hilycp4ew4ivtc7m5jaxp7q4pm5slioxw3fi3ur6ei65ybz4] inputs_to_check[5]: 5", "[agkvbkaha53nbz3aeeuhvxjvvc4glhfjofzkg6g2qjoo2e5otcx] inputs_to_check[6]: 6", "[j3s5elu6itwgjafc7rzhy4whrbufl6kfmlufjhh25grt643bk5f] inputs_to_check[7]: 7", "[yttmfmxblgcbsvbokguzowcorrcxz5uunxtcvsbe6nijgcx45he] inputs_to_check[8]: 8", "[qlgfiyqewrmkgqth2qm6wkq2ja5lzkapg3ypgnvoyfqqnidaoj3] inputs_to_check[9]: 9", "[j6c55jha5r2sdys2rwq7uqhtleea5dgjcye7nicfgft36v7xfvp] inputs_to_check[10]: 10", "[du4vyrfyozrfxcf6kk6ma7oqwatapifazeelfsawmsiu6gjdtxp] deterministic_algorithms_settings: (False, False, True)", "[qiptf2633zubseuei4bkisoq3not35l6lud6p23p4qmcsxiw2uq] cuda_matmul_settings: (False, True, True)", "[7uhqwjfn75ek3woo3k7em2mluon5hx2ojvzlevlvjvz6xfxjhzl] torch_version: ", "[c3z7bmoxyo6gl5hi47v6dc7jwsl55b3asd75nr25uyengi5ah3p] system_info[device]: {'name': 'NVIDIA PG509-210'}", "[3fb7kae6ogkdd4zcm3fkjoipdpybxhn4aoxzv7z7xsfwq233e4l] system_info[version]: {'triton': '3.1.0+5fe38ffd73dc767c8fadcf23ea82d79e257c37d44077eae7f681cf967565fd43e9c017937b-835d4fc33500e1accafc5c5e00f4f73d87432c114860c04b68849bf6f942b8e5-dc767c8fadcf23ea82d79e257c37d44077eae7f681cf967565fd43e9c017937b-23d635e690d670bf61798e1259674b78c0ed5ba222ab6a455f329f27a758fc2d-e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855-20b017e9c4d858ab05e783f77df50b86c6d6eee5d79f3f4b158562b4a54f8443-f44338a31e0534290b08653050804c3fabbde403a6d3004ae04f0c28495f0802-e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855-a979896b9c0acfd41dd953b90bdc4b10968f7c0b45a286eae3f829aaddb2bb55-da771298f7bc45d24a61f35ef51742304421df1ab49d50bf1fc510dd5a46ea4b-a8fb7be728d460b7ec64ab62edb8af1bbca8994fd718cde7178d46bad64530a1-71330f394e584b0df29595d49f6ac8ac0c5503db9147090dc58ad888cebac7be-f24adfd52383f7866791ebaa5d45a5d2cc826e56ee2fd285f438e85d201fe643-a34be0d3ae4b3ac9aede195cfda42f8a0a097b2bc9642fb59673ce6b3b607f10-36130a37af1b19a0dec569aa08d30b00c74c8f02b6b632999d86dea169146792-36d42f0429aae027cb985b53b9abc616fae4dad9e0ea03953e1e9fb46d0fb9a0-e5d2cb724c08d0ef4130f3ba858d22cf21f834bfd970a5388aa6ad2a6bab91f9', 'cuda': '12.0'}", "[z5x2bdhir5lzlbti73vdbfulnuu5vinzpwgmmgf4rjb775tzl3h] system_info[hash]: 9698c97edde4a99a2f3b54bbd0db5291bbcdb75c83acb376ccff61fb0bf0ac1a", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[TYPE_CHECKING]: False", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[abi_compatible]: False", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[aggressive_fusion]: False", "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[allow_buffer_reuse]: True", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[allow_stack_allocation]: False", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[always_keep_tensor_constants]: False", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[aot_inductor.debug_compile]: False", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[aot_inductor.debug_dump_consts_bin]: False", "[ngkkx5e6z7erl6da23zb2cmsctz4yvaqyameyg5hbqln4wrhh7x] inductor_config[aot_inductor.debug_intermediate_value_printer]: 0", "[tquy2we2efmowuj4wuqzcfcfdcrkzkzmwdae6hprj7fa64jpusq] inductor_config[aot_inductor.filtered_kernel_names]: None", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[aot_inductor.force_mmap_weights]: False", "[4bryyl4ahh5whyg3zwqebpwmjnx6w77nqgqbdjlowju6lkqtn7w] inductor_config[aot_inductor.metadata]: {}", "[v3hzzlv4tjgvp3pyhmzagjd25orl6n7nynoa7svlhhwk73b7u3c] inductor_config[aot_inductor.output_path]: ", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[aot_inductor.package]: False", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[aot_inductor.package_cpp_only]: False", "[v3hzzlv4tjgvp3pyhmzagjd25orl6n7nynoa7svlhhwk73b7u3c] inductor_config[aot_inductor.serialized_in_spec]: ", "[v3hzzlv4tjgvp3pyhmzagjd25orl6n7nynoa7svlhhwk73b7u3c] inductor_config[aot_inductor.serialized_out_spec]: ", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[aot_inductor.use_runtime_constant_folding]: False", "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[assert_indirect_indexing]: True", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[assume_aligned_inputs]: False", "[v3hzzlv4tjgvp3pyhmzagjd25orl6n7nynoa7svlhhwk73b7u3c] inductor_config[autoheuristic_collect]: ", "[jvchmi66fvqzlemhr5fcqorz5trfdtdalzfagtj2aolmimwqhdq] inductor_config[autoheuristic_log_path]: DEFAULT", "[jwbrgxes7vjqumngs5hyj6gn5nytv2whnppnzngvaagfmawhkkd] inductor_config[autoheuristic_use]: mixed_mm", "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[autotune_fallback_to_aten]: True", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[autotune_in_subproc]: False", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[autotune_local_cache]: False", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[autotune_multi_device]: False", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[autotune_remote_cache]: False", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[b2b_gemm_pass]: False", "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[batch_fusion]: True", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[benchmark_combo_kernel]: False", "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[benchmark_epilogue_fusion]: True", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[benchmark_fusion]: False", "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[benchmark_harness]: True", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[benchmark_kernel]: False", "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[bw_outputs_user_visible]: True", "[b4ha3ravs3qv237q65hpfqegbnoww7tf2ahcbu2i7xo6te5spqs] inductor_config[c_shim_version]: 2", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[check_stack_no_cycles_TESTING_ONLY]: False", "[aghvyrrgwvxijco2pk5wzc3cgmmthrbmgxitiibxuuscxdwrjd3] inductor_config[combo_kernel_allow_mixed_sizes]: 1", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[combo_kernel_foreach_dynamic_shapes]: False", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[combo_kernels]: False", "[aghvyrrgwvxijco2pk5wzc3cgmmthrbmgxitiibxuuscxdwrjd3] inductor_config[combo_kernels_autotune]: 1", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[comment_origin]: False", "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[comprehensive_padding]: True", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[compute_all_bounds]: False", "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[constant_and_index_propagation]: True", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[conv_1x1_as_mm]: False", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[coordinate_descent_check_all_directions]: False", "[aghvyrrgwvxijco2pk5wzc3cgmmthrbmgxitiibxuuscxdwrjd3] inductor_config[coordinate_descent_search_radius]: 1", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[coordinate_descent_tuning]: False", "[c7zj4qytmety6keurs3hsh5wn7foxp3dqx4kym2ucszzcb2ngrf] inductor_config[cpp.cxx]: (None, 'g++')", "[yrty22bseefglnysuoec4ji7j2rnaggdj3g33zzj7avogwfmgdw] inductor_config[cpp.descriptive_names]: original_aten", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[cpp.dynamic_threads]: False", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[cpp.enable_floating_point_contract_flag]: False", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[cpp.enable_kernel_profile]: False", "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[cpp.enable_loop_tail_vec]: True", "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[cpp.enable_tiling_heuristics]: True", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[cpp.enable_unsafe_math_opt_flag]: False", "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[cpp.fallback_scatter_reduce_sum]: True", "[tquy2we2efmowuj4wuqzcfcfdcrkzkzmwdae6hprj7fa64jpusq] inductor_config[cpp.gemm_cache_blocking]: None", "[aghvyrrgwvxijco2pk5wzc3cgmmthrbmgxitiibxuuscxdwrjd3] inductor_config[cpp.gemm_max_k_slices]: 1", "[tquy2we2efmowuj4wuqzcfcfdcrkzkzmwdae6hprj7fa64jpusq] inductor_config[cpp.gemm_thread_factors]: None", "[tquy2we2efmowuj4wuqzcfcfdcrkzkzmwdae6hprj7fa64jpusq] inductor_config[cpp.inject_log1p_bug_TESTING_ONLY]: None", "[tquy2we2efmowuj4wuqzcfcfdcrkzkzmwdae6hprj7fa64jpusq] inductor_config[cpp.inject_relu_bug_TESTING_ONLY]: None", "[ebt2ncs4f5y7dn7btzi76mnouepvzad474tmp5iju4wiuumjl4s] inductor_config[cpp.max_horizontal_fusion_size]: 16", "[g7rrnbg5yonzux3cfj5ovre5lob3ayda7qcfpxjvtwmiz4uicii] inductor_config[cpp.min_chunk_size]: 4096", "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[cpp.no_redundant_loops]: True", "[tquy2we2efmowuj4wuqzcfcfdcrkzkzmwdae6hprj7fa64jpusq] inductor_config[cpp.simdlen]: None", "[sz3im5ogc6asp7g4uqocnovype63tkdexzfrniv6hn2oank3biu] inductor_config[cpp.threads]: -1", "[tquy2we2efmowuj4wuqzcfcfdcrkzkzmwdae6hprj7fa64jpusq] inductor_config[cpp.vec_isa_ok]: None", "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[cpp.weight_prepack]: True", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[cpp_wrapper]: False", "[bsvfcwwoczx2rlkdz2eta6doujsymyihmi46hhwk6clrrvwcb6m] inductor_config[cpu_backend]: cpp", "[tquy2we2efmowuj4wuqzcfcfdcrkzkzmwdae6hprj7fa64jpusq] inductor_config[cuda.arch]: None", "[tvyftmtdmezlejo2xllu7awzv4pzc4vm4fub4b3gpl5jptjkosi] inductor_config[cuda.compile_opt_level]: -O1", "[tquy2we2efmowuj4wuqzcfcfdcrkzkzmwdae6hprj7fa64jpusq] inductor_config[cuda.cuda_cxx]: None", "[aghvyrrgwvxijco2pk5wzc3cgmmthrbmgxitiibxuuscxdwrjd3] inductor_config[cuda.cutlass_backend_min_gemm_size]: 1", "[tquy2we2efmowuj4wuqzcfcfdcrkzkzmwdae6hprj7fa64jpusq] inductor_config[cuda.cutlass_max_profiling_configs]: None", "[tquy2we2efmowuj4wuqzcfcfdcrkzkzmwdae6hprj7fa64jpusq] inductor_config[cuda.cutlass_op_allowlist_regex]: None", "[lwkz5chtpji756gurqw4foijfi7zfgljtnn5nmnvdi2skpt4mgh] inductor_config[cuda.cutlass_op_denylist_regex]: pingpong", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[cuda.enable_cuda_lto]: False", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[cuda.enable_debug_info]: False", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[cuda.enable_ptxas_info]: False", "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[cuda.generate_test_runner]: True", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[cuda.use_fast_math]: False", "[tquy2we2efmowuj4wuqzcfcfdcrkzkzmwdae6hprj7fa64jpusq] inductor_config[cuda.version]: None", "[caw4ly2z672k6kjfahoxwpajp5idhhtrpgf3ma2clylcp7c7aid] inductor_config[cuda_backend]: triton", "[pikr7bbcoixfzftsazp5ggufhdklj24babfry77bl4nuvyrrcp4] inductor_config[custom_op_default_layout_constraint]: needs_fixed_stride_order", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[dce]: False", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[debug]: False", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[debug_fusion]: False", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[debug_index_asserts]: False", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[debug_ir_traceback]: False", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[decompose_mem_bound_mm]: False", "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[developer_warnings]: True", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[disable_cpp_codegen]: False", "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[disable_padding_cpu]: True", "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[disable_progress]: True", "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[dynamic_scale_rblock]: True", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[efficient_conv_bn_eval_fx_passes]: False", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[emulate_precision_casts]: False", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[enable_auto_functionalized_v2]: False", "[v3hzzlv4tjgvp3pyhmzagjd25orl6n7nynoa7svlhhwk73b7u3c] inductor_config[enabled_metric_tables]: ", "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[epilogue_fusion]: True", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[epilogue_fusion_first]: False", "[lxxtoqhcoepwfokeiibd575gnxo3uzwiv4hmpomlwkpzqz3qzsh] inductor_config[estimate_op_runtime]: default", "[h25wqx6vliw4j5rtzzbv6latydxyei3deyg6v7wzvnzryfktuki] inductor_config[external_matmul]: []", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[fallback_random]: False", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[force_disable_caches]: False", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[force_fuse_int_mm_with_mul]: False", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[force_layout_optimization]: False", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[force_same_precision]: False", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[force_shape_pad]: False", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[freezing]: False", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[freezing_discard_parameters]: False", "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[fx_graph_cache]: True", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[fx_graph_remote_cache]: False", "[62lrdx35b7hnumwb7mp5oc5y5csm2abylvtdzfloct3noaqov3n] inductor_config[fx_passes_numeric_check]: {'pre_grad': False, 'post_grad': False, 'precision': 0.0001, 'num_iterations': 1, 'requires_optimizer': True}", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[generate_intermediate_hooks]: False", "[tquy2we2efmowuj4wuqzcfcfdcrkzkzmwdae6hprj7fa64jpusq] inductor_config[global_cache_dir]: None", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[group_fusion]: False", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[halide.asserts]: False", "[ljhgflgihidopsfsdcbqynv27nceykby3nutyd5jlcpq7n6e7l4] inductor_config[halide.cpu_target]: host", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[halide.debug]: False", "[wx7vmsmrdpk5ue2txlywp3lj3faqmdjphs5fgg2ehzsyno7uovg] inductor_config[halide.gpu_target]: host-cuda", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[halide.scan_kernels]: False", "[k5ogk6345jvklsnu7g2njqstiz2g6pm5wmqpgg3kasrmuqwjvl6] inductor_config[halide.scheduler_cpu]: Adams2019", "[svgytlua5wcyeia7wq7e6zgh5tsueikrnzchmdmouvmkpfsc2zq] inductor_config[halide.scheduler_cuda]: Anderson2021", "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[implicit_fallbacks]: True", "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[inplace_buffers]: True", "[5fxczt3ciyxitdhizb7sfsgn7fhpczcqsngttnt5ot2wyctk7co] inductor_config[inter_node_bw]: 25", "[yezuzjtg4h3jjur4jwtwiehbyixa7eonq4tqsqmwqve2lvvmrem] inductor_config[intra_node_bw]: 300", "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[is_nightly_or_source]: True", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[is_predispatch]: False", "[tquy2we2efmowuj4wuqzcfcfdcrkzkzmwdae6hprj7fa64jpusq] inductor_config[joint_custom_post_pass]: None", "[tquy2we2efmowuj4wuqzcfcfdcrkzkzmwdae6hprj7fa64jpusq] inductor_config[joint_custom_pre_pass]: None", "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[joint_graph_constant_folding]: True", "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[keep_output_stride]: True", "[j6c55jha5r2sdys2rwq7uqhtleea5dgjcye7nicfgft36v7xfvp] inductor_config[kernel_name_max_ops]: 10", "[4p2fdjlvxrcw7c7fvzm5huhtqxnro4kvkx56f7p5zyrxqkwooov] inductor_config[layout_opt_default]: 1", "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[layout_optimization]: True", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[loop_ordering_after_fusion]: False", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[max_autotune]: False", "[uqlsbif4zxd75vt522p52txyuguieipi2lwz5g5awt56lccqk7s] inductor_config[max_autotune_conv_backends]: ATEN,TRITON", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[max_autotune_gemm]: False", "[2y7luesktjrque3nr7qtxnum2mkbeegzdrsvkm3rvdlhqboajhx] inductor_config[max_autotune_gemm_backends]: ATEN,TRITON,CPP", "[jvchmi66fvqzlemhr5fcqorz5trfdtdalzfagtj2aolmimwqhdq] inductor_config[max_autotune_gemm_search_space]: DEFAULT", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[max_autotune_pointwise]: False", "[bh33ranllcgilhgmgr3qvygzxjm6isq5iexnfm3zx6fnr2zwlp2] inductor_config[max_autotune_subproc_graceful_timeout_seconds]: 1.0", "[iglov24t7x5ruci344aer2tm6nqshi4veuw4wxlssxtu46cx76m] inductor_config[max_autotune_subproc_result_timeout_seconds]: 60.0", "[pwoh5aypf4fxbntdvwt67rppxorqos6xr3w7qzeun6kblbfg2ga] inductor_config[max_autotune_subproc_terminate_timeout_seconds]: 2.0", "[aghvyrrgwvxijco2pk5wzc3cgmmthrbmgxitiibxuuscxdwrjd3] inductor_config[max_epilogue_benchmarked_choices]: 1", "[jykiys6ynafs3zdylwa5ggq6j655mxeh42d6mtdi22gffkrmiac] inductor_config[max_fusion_size]: 64", "[yttmfmxblgcbsvbokguzowcorrcxz5uunxtcvsbe6nijgcx45he] inductor_config[max_pointwise_cat_inputs]: 8", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[memory_planning]: False", "[x75won4jmsgeb63pcvwr2y4eteyzzdhmf5rv6xhjppie4hx2yu5] inductor_config[memory_pool]: intermediates", "[v2td5s4lnsvyxvaevy4chx6kc5h3mm2axazbgwimqule5zrzao7] inductor_config[mixed_mm_choice]: heuristic", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[nan_asserts]: False", "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[optimize_scatter_upon_const_tensor]: True", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[pad_channels_last]: False", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[pad_outputs]: False", "[ljdqgtysl3vdf7j6attlz5gmjg2ncihnveojfyubosplmkrjgra] inductor_config[padding_alignment_bytes]: 128", "[dnnw5ks3yxrp7mwvihb2hh4tqx35ye637xt33x64kw4fvz2nyzg] inductor_config[padding_stride_threshold]: 1024", "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[pattern_matcher]: True", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[permute_fusion]: False", "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[pick_loop_orders]: True", "[tquy2we2efmowuj4wuqzcfcfdcrkzkzmwdae6hprj7fa64jpusq] inductor_config[post_grad_custom_post_pass]: None", "[tquy2we2efmowuj4wuqzcfcfdcrkzkzmwdae6hprj7fa64jpusq] inductor_config[post_grad_custom_pre_pass]: None", "[4bryyl4ahh5whyg3zwqebpwmjnx6w77nqgqbdjlowju6lkqtn7w] inductor_config[post_grad_fusion_options]: {}", "[tquy2we2efmowuj4wuqzcfcfdcrkzkzmwdae6hprj7fa64jpusq] inductor_config[pre_grad_custom_pass]: None", "[4bryyl4ahh5whyg3zwqebpwmjnx6w77nqgqbdjlowju6lkqtn7w] inductor_config[pre_grad_fusion_options]: {}", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[profile_bandwidth]: False", "[tquy2we2efmowuj4wuqzcfcfdcrkzkzmwdae6hprj7fa64jpusq] inductor_config[profile_bandwidth_output]: None", "[v3hzzlv4tjgvp3pyhmzagjd25orl6n7nynoa7svlhhwk73b7u3c] inductor_config[profile_bandwidth_regex]: ", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[profile_bandwidth_with_do_bench_using_profiling]: False", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[profiler_mark_wrapper_call]: False", "[yttmfmxblgcbsvbokguzowcorrcxz5uunxtcvsbe6nijgcx45he] inductor_config[realize_acc_reads_threshold]: 8", "[rr5m5hsocoyodldz7vcvaizdwvm2rt34evmqdxvng7wz3tufvo6] inductor_config[realize_opcount_threshold]: 30", "[lkkae3meylaixfif4thncru4hjqeaislawjoghffrbwuscaagei] inductor_config[realize_reads_threshold]: 4", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[reorder_for_compute_comm_overlap]: False", "[ssupi7bu3rrhdpg2jyegzncu3kg3nnhklyliqvutaxgs7y7k3dx] inductor_config[reorder_for_compute_comm_overlap_passes]: ['reorder_compute_for_overlap', 'sink_waits', 'raise_comms']", "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[reorder_for_locality]: True", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[reorder_for_peak_memory]: False", "[h25wqx6vliw4j5rtzzbv6latydxyei3deyg6v7wzvnzryfktuki] inductor_config[rocm.arch]: []", "[tquy2we2efmowuj4wuqzcfcfdcrkzkzmwdae6hprj7fa64jpusq] inductor_config[rocm.ck_dir]: None", "[oartxnko2l7d67tzwwm2otcumaut3n4wwcfgz3o377hmcveu5ft] inductor_config[rocm.ck_supported_arch]: ['gfx90a', 'gfx940', 'gfx941', 'gfx942']", "[klfqjprnpfhcdurgvuikvc4rpd5ynkpk77toousr5h3u5roty6p] inductor_config[rocm.compile_opt_level]: -O2", "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[rocm.flush_denormals]: True", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[rocm.is_debug]: False", "[tquy2we2efmowuj4wuqzcfcfdcrkzkzmwdae6hprj7fa64jpusq] inductor_config[rocm.n_max_profiling_configs]: None", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[rocm.print_kernel_resource_usage]: False", "[tquy2we2efmowuj4wuqzcfcfdcrkzkzmwdae6hprj7fa64jpusq] inductor_config[rocm.rocm_home]: None", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[rocm.save_temps]: False", "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[rocm.use_fast_math]: True", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[rocm.use_preselected_instances]: False", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[save_args]: False", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[search_autotune_cache]: False", "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[shape_padding]: True", "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[size_asserts]: True", "[tquy2we2efmowuj4wuqzcfcfdcrkzkzmwdae6hprj7fa64jpusq] inductor_config[sleep_sec_TESTING_ONLY]: None", "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[split_cat_fx_passes]: True", "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[split_reductions]: True", "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[static_weight_shapes]: True", "[tquy2we2efmowuj4wuqzcfcfdcrkzkzmwdae6hprj7fa64jpusq] inductor_config[triton.autotune_at_compile_time]: None", "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[triton.autotune_cublasLt]: True", "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[triton.autotune_pointwise]: True", "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[triton.codegen_upcast_to_fp32]: True", "[tuax46wac7rfv2trf5gcps6vleo3cq44lbnrdxtprvo3ljjaddj] inductor_config[triton.cudagraph_dynamic_shape_warn_limit]: 50", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[triton.cudagraph_skip_dynamic_graphs]: False", "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[triton.cudagraph_support_input_mutation]: True", "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[triton.cudagraph_trees]: True", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[triton.cudagraph_trees_history_recording]: False", "[ljdqgtysl3vdf7j6attlz5gmjg2ncihnveojfyubosplmkrjgra] inductor_config[triton.cudagraph_unexpected_rerecord_limit]: 128", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[triton.cudagraphs]: False", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[triton.debug_sync_graph]: False", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[triton.debug_sync_kernel]: False", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[triton.dense_indexing]: False", "[yrty22bseefglnysuoec4ji7j2rnaggdj3g33zzj7avogwfmgdw] inductor_config[triton.descriptive_names]: original_aten", "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[triton.divisible_by_16]: True", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[triton.fast_path_cudagraph_asserts]: False", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[triton.force_cudagraph_sync]: False", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[triton.force_cudagraphs_warmup]: False", "[tquy2we2efmowuj4wuqzcfcfdcrkzkzmwdae6hprj7fa64jpusq] inductor_config[triton.inject_relu_bug_TESTING_ONLY]: None", "[pr5nr4a7dthirgd2ljo3d2xakc63ywxugusu6mkmr6gmpeliyib] inductor_config[triton.max_tiles]: 2", "[fv6slhtedtydps5s5u2etitscliblzcidyitqf7krsv4e23fzk6] inductor_config[triton.min_split_scan_rblock]: 256", "[vrl5ktomgtzox5xucd3np6vug3vyj6hwwzahqijuwpmamlv7ohi] inductor_config[triton.multi_kernel]: 0", "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[triton.persistent_reductions]: True", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[triton.prefer_nd_tiling]: False", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[triton.skip_cudagraph_warmup]: False", "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[triton.slow_path_cudagraph_asserts]: True", "[ebt2ncs4f5y7dn7btzi76mnouepvzad474tmp5iju4wiuumjl4s] inductor_config[triton.spill_threshold]: 16", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[triton.store_cubin]: False", "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[triton.tiling_prevents_pointwise_fusion]: True", "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[triton.tiling_prevents_reduction_fusion]: True", "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[triton.unique_kernel_names]: True", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[triton.use_block_ptr]: False", "[vzzema5ityqj2wepdmkulue7q5pcevdr5h27oxxutf35d4tjume] inductor_config[triton_kernel_default_layout_constraint]: flexible_layout", "[wft6ljqsfr3x4m7fa5zuyb7cwknky4irrxz4bjr6uzr2yiopxqj] inductor_config[unbacked_symint_fallback]: 8192", "[yttmfmxblgcbsvbokguzowcorrcxz5uunxtcvsbe6nijgcx45he] inductor_config[unroll_reductions_threshold]: 8", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[unsafe_ignore_unsupported_triton_autotune_args]: False", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[use_minimal_arrayref_interface]: False", "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[use_mixed_mm]: True", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[verbose_progress]: False", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[warn_mix_layout]: False"], "cache_event_time": 1727975463283186408, "cache_state": "hit", "time_saved_ns": 6372999079} +V1003 10:11:03.284000 2235078 torch/_dynamo/utils.py:988] {"chromium_event": {}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0, "has_payload": "d86af378fd420082506385ca5acfaccc"} + { + "name": "inductor_compile", + "ts": 1727975463284819.8, + "args": { + "cache_stats": { + "fxgraph_cache_hit": 1, + "fxgraph_cache_miss": 1, + "fxgraph_cache_bypass": 0 + } + }, + "ph": "E", + "cat": "dynamo_timed", + "tid": 0, + "pid": 0 + } +V1003 10:11:03.285000 2235078 torch/_dynamo/utils.py:988] {"chromium_event": {}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0, "has_payload": "45657f0a65e5f7e3f3fcfb73f437d842"} + { + "name": "compile_fx_inner", + "ts": 1727975463285151.0, + "args": { + "cache_stats": { + "fxgraph_cache_hit": 1, + "fxgraph_cache_miss": 1, + "fxgraph_cache_bypass": 0 + } + }, + "ph": "E", + "cat": "dynamo_timed", + "tid": 0, + "pid": 0 + } +V1003 10:11:03.285000 2235078 torch/_dynamo/utils.py:988] {"chromium_event": {}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0, "has_payload": "12df42838dd8ed7060ce2ffd52760f3c"} + { + "name": "compile_fx..fw_compiler_base", + "ts": 1727975463285792.0, + "args": { + "cache_stats": { + "fxgraph_cache_hit": 1, + "fxgraph_cache_miss": 1, + "fxgraph_cache_bypass": 0 + } + }, + "ph": "E", + "cat": "dynamo_timed", + "tid": 0, + "pid": 0 + } +V1003 10:11:03.289000 2235078 torch/_dynamo/utils.py:988] {"chromium_event": {}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0, "has_payload": "1bd305e520f60c583bf3f70d53bf1988"} + { + "name": "create_aot_dispatcher_function", + "ts": 1727975463289199.8, + "args": { + "cache_stats": { + "fxgraph_cache_hit": 1, + "fxgraph_cache_miss": 1, + "fxgraph_cache_bypass": 0 + } + }, + "ph": "E", + "cat": "dynamo_timed", + "tid": 0, + "pid": 0 + } +V1003 10:11:03.289000 2235078 torch/_dynamo/utils.py:988] {"chromium_event": {}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0, "has_payload": "02f3ea38f17491dd957dd1864f5411e6"} + { + "name": "backend_compile", + "ts": 1727975463289789.5, + "args": { + "cache_stats": { + "fxgraph_cache_hit": 1, + "fxgraph_cache_miss": 1, + "fxgraph_cache_bypass": 0 + } + }, + "ph": "E", + "cat": "dynamo_timed", + "tid": 0, + "pid": 0 + } +V1003 10:11:03.290000 2235078 torch/_dynamo/utils.py:988] {"chromium_event": {}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0, "has_payload": "573fd1c41404d17c4da25ab7cfc13f4b"} + { + "name": "OutputGraph.call_user_compiler", + "ts": 1727975463290072.5, + "args": { + "cache_stats": { + "fxgraph_cache_hit": 1, + "fxgraph_cache_miss": 1, + "fxgraph_cache_bypass": 0 + } + }, + "ph": "E", + "cat": "dynamo_timed", + "tid": 0, + "pid": 0 + } +V1003 10:11:03.320000 2235078 torch/_dynamo/guards.py:2311] {"dynamo_cpp_guards_str": {}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0, "has_payload": "6627aec2e890cf84ba5ee985a77950cc"} + + TREE_GUARD_MANAGER: + +- RootGuardManager + | +- DEFAULT_DEVICE: utils_device.CURRENT_DEVICE == None # _dynamo/output_graph.py:471 in init_ambient_guards + | +- GLOBAL_STATE: ___check_global_state() + | +- TORCH_FUNCTION_MODE_STACK: ___check_torch_function_mode_stack() + | +- GuardManager: source=L['k'], accessed_by=DictGetItemGuardAccessor(k) + | | +- TYPE_MATCH: ___check_type_id(L['k'], 82028112) + | | +- TENSOR_MATCH: check_tensor(L['k'], Tensor, DispatchKeySet(CUDA, BackendSelect, ADInplaceOrView, AutogradCUDA), torch.float32, device=0, requires_grad=False, size=[1, 4, 512, 64], stride=[131072, 32768, 64, 1]) + | | +- NO_HASATTR: hasattr(L['k'], '_dynamo_dynamic_indices') == False + | | +- NO_TENSOR_ALIASING: check_no_aliasing(L['k'], L['q'], L['v'], L['block_mask'].q_indices, L['block_mask'].kv_indices, L['block_mask'].q_num_blocks, L['block_mask'].kv_num_blocks, L['block_mask'].full_q_indices, L['block_mask'].full_kv_indices, L['block_mask'].full_q_num_blocks, L['block_mask'].full_kv_num_blocks) + | +- GuardManager: source=L['q'], accessed_by=DictGetItemGuardAccessor(q) + | | +- TYPE_MATCH: ___check_type_id(L['q'], 82028112) + | | +- TENSOR_MATCH: check_tensor(L['q'], Tensor, DispatchKeySet(CUDA, BackendSelect, ADInplaceOrView, AutogradCUDA), torch.float32, device=0, requires_grad=False, size=[1, 4, 512, 64], stride=[131072, 32768, 64, 1]) + | | +- NO_HASATTR: hasattr(L['q'], '_dynamo_dynamic_indices') == False + | | +- NO_TENSOR_ALIASING + | +- GuardManager: source=L['v'], accessed_by=DictGetItemGuardAccessor(v) + | | +- TYPE_MATCH: ___check_type_id(L['v'], 82028112) + | | +- TENSOR_MATCH: check_tensor(L['v'], Tensor, DispatchKeySet(CUDA, BackendSelect, ADInplaceOrView, AutogradCUDA), torch.float32, device=0, requires_grad=False, size=[1, 4, 512, 64], stride=[131072, 32768, 64, 1]) + | | +- NO_HASATTR: hasattr(L['v'], '_dynamo_dynamic_indices') == False + | | +- NO_TENSOR_ALIASING + | +- GuardManager: source=L['score_mod'], accessed_by=DictGetItemGuardAccessor(score_mod) + | | +- GuardManager: source=L['score_mod'].__code__, accessed_by=GetAttrGuardAccessor(__code__) + | | | +- ID_MATCH: ___check_obj_id(L['score_mod'].__code__, 140413271879296) + | +- GuardManager: source=L['block_mask'], accessed_by=DictGetItemGuardAccessor(block_mask) + | | +- TYPE_MATCH: ___check_type_id(L['block_mask'], 387600320) + | | +- GuardManager: source=L['block_mask'].mask_mod, accessed_by=GetAttrGuardAccessor(mask_mod) + | | | +- GuardManager: source=L['block_mask'].mask_mod.__code__, accessed_by=GetAttrGuardAccessor(__code__) + | | | | +- ID_MATCH: ___check_obj_id(L['block_mask'].mask_mod.__code__, 140413271880128) + | | +- GuardManager: source=L['block_mask'].q_indices, accessed_by=GetAttrGuardAccessor(q_indices) + | | | +- TENSOR_MATCH: check_tensor(L['block_mask'].q_indices, Tensor, DispatchKeySet(CUDA, BackendSelect, ADInplaceOrView, AutogradCUDA), torch.int32, device=0, requires_grad=False, size=[1, 1, 16, 16], stride=[256, 256, 16, 1]) + | | | +- NO_HASATTR: hasattr(L['block_mask'].q_indices, '_dynamo_dynamic_indices') == False + | | | +- NO_TENSOR_ALIASING + | | +- GuardManager: source=L['block_mask'].BLOCK_SIZE, accessed_by=GetAttrGuardAccessor(BLOCK_SIZE) + | | | +- TYPE_MATCH: ___check_type_id(L['block_mask'].BLOCK_SIZE, 8815232) + | | | +- LENGTH_CHECK: len(L['block_mask'].BLOCK_SIZE) == 2 + | | | +- GuardManager: source=L['block_mask'].BLOCK_SIZE[0], accessed_by=TupleGetItemGuardAccessor(0) + | | | | +- EQUALS_MATCH: L['block_mask'].BLOCK_SIZE[0] == 128 + | | | +- GuardManager: source=L['block_mask'].BLOCK_SIZE[1], accessed_by=TupleGetItemGuardAccessor(1) + | | | | +- EQUALS_MATCH: L['block_mask'].BLOCK_SIZE[1] == 128 + | | +- GuardManager: source=L['block_mask'].kv_indices, accessed_by=GetAttrGuardAccessor(kv_indices) + | | | +- TENSOR_MATCH: check_tensor(L['block_mask'].kv_indices, Tensor, DispatchKeySet(CUDA, BackendSelect, ADInplaceOrView, AutogradCUDA), torch.int32, device=0, requires_grad=False, size=[1, 1, 16, 16], stride=[256, 256, 16, 1]) + | | | +- NO_HASATTR: hasattr(L['block_mask'].kv_indices, '_dynamo_dynamic_indices') == False + | | | +- NO_TENSOR_ALIASING + | | +- GuardManager: source=L['block_mask'].q_num_blocks, accessed_by=GetAttrGuardAccessor(q_num_blocks) + | | | +- TENSOR_MATCH: check_tensor(L['block_mask'].q_num_blocks, Tensor, DispatchKeySet(CUDA, BackendSelect, ADInplaceOrView, AutogradCUDA), torch.int32, device=0, requires_grad=False, size=[1, 1, 16], stride=[16, 16, 1]) + | | | +- NO_HASATTR: hasattr(L['block_mask'].q_num_blocks, '_dynamo_dynamic_indices') == False + | | | +- NO_TENSOR_ALIASING + | | +- GuardManager: source=L['block_mask'].kv_num_blocks, accessed_by=GetAttrGuardAccessor(kv_num_blocks) + | | | +- TENSOR_MATCH: check_tensor(L['block_mask'].kv_num_blocks, Tensor, DispatchKeySet(CUDA, BackendSelect, ADInplaceOrView, AutogradCUDA), torch.int32, device=0, requires_grad=False, size=[1, 1, 16], stride=[16, 16, 1]) + | | | +- NO_HASATTR: hasattr(L['block_mask'].kv_num_blocks, '_dynamo_dynamic_indices') == False + | | | +- NO_TENSOR_ALIASING + | | +- GuardManager: source=L['block_mask'].full_q_indices, accessed_by=GetAttrGuardAccessor(full_q_indices) + | | | +- TENSOR_MATCH: check_tensor(L['block_mask'].full_q_indices, Tensor, DispatchKeySet(CUDA, BackendSelect, ADInplaceOrView, AutogradCUDA), torch.int32, device=0, requires_grad=False, size=[1, 1, 16, 16], stride=[256, 256, 16, 1]) + | | | +- NO_HASATTR: hasattr(L['block_mask'].full_q_indices, '_dynamo_dynamic_indices') == False + | | | +- NO_TENSOR_ALIASING + | | +- GuardManager: source=L['block_mask'].full_kv_indices, accessed_by=GetAttrGuardAccessor(full_kv_indices) + | | | +- TENSOR_MATCH: check_tensor(L['block_mask'].full_kv_indices, Tensor, DispatchKeySet(CUDA, BackendSelect, ADInplaceOrView, AutogradCUDA), torch.int32, device=0, requires_grad=False, size=[1, 1, 16, 16], stride=[256, 256, 16, 1]) + | | | +- NO_HASATTR: hasattr(L['block_mask'].full_kv_indices, '_dynamo_dynamic_indices') == False + | | | +- NO_TENSOR_ALIASING + | | +- GuardManager: source=L['block_mask'].full_q_num_blocks, accessed_by=GetAttrGuardAccessor(full_q_num_blocks) + | | | +- TENSOR_MATCH: check_tensor(L['block_mask'].full_q_num_blocks, Tensor, DispatchKeySet(CUDA, BackendSelect, ADInplaceOrView, AutogradCUDA), torch.int32, device=0, requires_grad=False, size=[1, 1, 16], stride=[16, 16, 1]) + | | | +- NO_HASATTR: hasattr(L['block_mask'].full_q_num_blocks, '_dynamo_dynamic_indices') == False + | | | +- NO_TENSOR_ALIASING + | | +- GuardManager: source=L['block_mask'].full_kv_num_blocks, accessed_by=GetAttrGuardAccessor(full_kv_num_blocks) + | | | +- TENSOR_MATCH: check_tensor(L['block_mask'].full_kv_num_blocks, Tensor, DispatchKeySet(CUDA, BackendSelect, ADInplaceOrView, AutogradCUDA), torch.int32, device=0, requires_grad=False, size=[1, 1, 16], stride=[16, 16, 1]) + | | | +- NO_HASATTR: hasattr(L['block_mask'].full_kv_num_blocks, '_dynamo_dynamic_indices') == False + | | | +- NO_TENSOR_ALIASING + | | +- GuardManager: source=L['block_mask'].as_tuple, accessed_by=GetAttrGuardAccessor(as_tuple) + | | | +- GuardManager: source=L['block_mask'].as_tuple, accessed_by=FuncDefaultsGuardAccessor + | | | | +- GuardManager: source=L['block_mask'].as_tuple.__defaults__[0], accessed_by=GetItemGuardAccessor(0) + | | | | | +- ID_MATCH: ___check_obj_id(L['block_mask'].as_tuple.__defaults__[0], 8911040) + | +- GuardManager: source=L['flex_attention'], accessed_by=DictGetItemGuardAccessor(flex_attention) + | | +- GuardManager: source=L['flex_attention'].__code__, accessed_by=GetAttrGuardAccessor(__code__) + | | | +- ID_MATCH: ___check_obj_id(L['flex_attention'].__code__, 387082992) + | | +- GuardManager: source=L['flex_attention'], accessed_by=FuncDefaultsGuardAccessor + | | | +- GuardManager: source=L['flex_attention'].__defaults__[2], accessed_by=GetItemGuardAccessor(2) + | | | | +- ID_MATCH: ___check_obj_id(L['flex_attention'].__defaults__[2], 8825760) + | | | +- GuardManager: source=L['flex_attention'].__defaults__[3], accessed_by=GetItemGuardAccessor(3) + | | | | +- ID_MATCH: ___check_obj_id(L['flex_attention'].__defaults__[3], 8910592) + | | | +- GuardManager: source=L['flex_attention'].__defaults__[4], accessed_by=GetItemGuardAccessor(4) + | | | | +- ID_MATCH: ___check_obj_id(L['flex_attention'].__defaults__[4], 8910592) + | | | +- GuardManager: source=L['flex_attention'].__defaults__[5], accessed_by=GetItemGuardAccessor(5) + | | | | +- ID_MATCH: ___check_obj_id(L['flex_attention'].__defaults__[5], 8825760) + | +- GuardManager: source=G, accessed_by=GlobalsGuardAccessor + | | +- GuardManager: source=G['__builtins_dict___6'], accessed_by=DictGetItemGuardAccessor(__builtins_dict___6) + | | | +- GuardManager: source=G['__builtins_dict___6']['len'], accessed_by=DictGetItemGuardAccessor(len) + | | | | +- ID_MATCH: ___check_obj_id(G['__builtins_dict___6']['len'], 140413275558816) + | | | +- GuardManager: source=G['__builtins_dict___6']['sum'], accessed_by=DictGetItemGuardAccessor(sum) + | | | | +- ID_MATCH: ___check_obj_id(G['__builtins_dict___6']['sum'], 140413275559936) + | | | +- GuardManager: source=G['__builtins_dict___6']['list'], accessed_by=DictGetItemGuardAccessor(list) + | | | | +- ID_MATCH: ___check_obj_id(G['__builtins_dict___6']['list'], 8844320) + | | | +- GuardManager: source=G['__builtins_dict___6']['type'], accessed_by=DictGetItemGuardAccessor(type) + | | | | +- ID_MATCH: ___check_obj_id(G['__builtins_dict___6']['type'], 8813248) + | | | +- GuardManager: source=G['__builtins_dict___6']['tuple'], accessed_by=DictGetItemGuardAccessor(tuple) + | | | | +- ID_MATCH: ___check_obj_id(G['__builtins_dict___6']['tuple'], 8815232) + | | | +- GuardManager: source=G['__builtins_dict___6']['object'], accessed_by=DictGetItemGuardAccessor(object) + | | | | +- ID_MATCH: ___check_obj_id(G['__builtins_dict___6']['object'], 8813984) + | | | +- GuardManager: source=G['__builtins_dict___6']['isinstance'], accessed_by=DictGetItemGuardAccessor(isinstance) + | | | | +- ID_MATCH: ___check_obj_id(G['__builtins_dict___6']['isinstance'], 140413275558496) + | | +- GuardManager: source=G['__import_torch_dot_utils_dot__pytree'], accessed_by=DictGetItemGuardAccessor(__import_torch_dot_utils_dot__pytree) + | | | +- ID_MATCH: ___check_obj_id(G['__import_torch_dot_utils_dot__pytree'], 140411217627952) + | | | +- GuardManager: source=G['__import_torch_dot_utils_dot__pytree'].TreeSpec, accessed_by=GetAttrGuardAccessor(TreeSpec) + | | | | +- ID_MATCH: ___check_obj_id(G['__import_torch_dot_utils_dot__pytree'].TreeSpec, 84866496) + | | | +- GuardManager: source=G['__import_torch_dot_utils_dot__pytree']._is_leaf, accessed_by=GetAttrGuardAccessor(_is_leaf) + | | | | +- GuardManager: source=G['__import_torch_dot_utils_dot__pytree']._is_leaf.__code__, accessed_by=GetAttrGuardAccessor(__code__) + | | | | | +- ID_MATCH: ___check_obj_id(G['__import_torch_dot_utils_dot__pytree']._is_leaf.__code__, 140411217262720) + | | | +- GuardManager: source=G['__import_torch_dot_utils_dot__pytree']._LEAF_SPEC, accessed_by=GetAttrGuardAccessor(_LEAF_SPEC) + | | | | +- TYPE_MATCH: ___check_type_id(G['__import_torch_dot_utils_dot__pytree']._LEAF_SPEC, 85171104) + | | | | +- GuardManager: source=G['__import_torch_dot_utils_dot__pytree']._LEAF_SPEC.type, accessed_by=GetAttrGuardAccessor(type) + | | | | | +- ID_MATCH: ___check_obj_id(G['__import_torch_dot_utils_dot__pytree']._LEAF_SPEC.type, 8825760) + | | | | +- GuardManager: source=G['__import_torch_dot_utils_dot__pytree']._LEAF_SPEC.context, accessed_by=GetAttrGuardAccessor(context) + | | | | | +- ID_MATCH: ___check_obj_id(G['__import_torch_dot_utils_dot__pytree']._LEAF_SPEC.context, 8825760) + | | | | +- GuardManager: source=G['__import_torch_dot_utils_dot__pytree']._LEAF_SPEC.num_nodes, accessed_by=GetAttrGuardAccessor(num_nodes) + | | | | | +- EQUALS_MATCH: G['__import_torch_dot_utils_dot__pytree']._LEAF_SPEC.num_nodes == 1 + | | | | +- GuardManager: source=G['__import_torch_dot_utils_dot__pytree']._LEAF_SPEC.num_leaves, accessed_by=GetAttrGuardAccessor(num_leaves) + | | | | | +- EQUALS_MATCH: G['__import_torch_dot_utils_dot__pytree']._LEAF_SPEC.num_leaves == 1 + | | | | +- GuardManager: source=G['__import_torch_dot_utils_dot__pytree']._LEAF_SPEC.num_children, accessed_by=GetAttrGuardAccessor(num_children) + | | | | | +- EQUALS_MATCH: G['__import_torch_dot_utils_dot__pytree']._LEAF_SPEC.num_children == 0 + | | | | +- GuardManager: source=G['__import_torch_dot_utils_dot__pytree']._LEAF_SPEC.children_specs, accessed_by=GetAttrGuardAccessor(children_specs) + | | | | | +- TYPE_MATCH: ___check_type_id(G['__import_torch_dot_utils_dot__pytree']._LEAF_SPEC.children_specs, 8844320) + | | | | | +- LENGTH_CHECK: not G['__import_torch_dot_utils_dot__pytree']._LEAF_SPEC.children_specs + | | | +- GuardManager: source=G['__import_torch_dot_utils_dot__pytree']._get_node_type, accessed_by=GetAttrGuardAccessor(_get_node_type) + | | | | +- GuardManager: source=G['__import_torch_dot_utils_dot__pytree']._get_node_type.__code__, accessed_by=GetAttrGuardAccessor(__code__) + | | | | | +- ID_MATCH: ___check_obj_id(G['__import_torch_dot_utils_dot__pytree']._get_node_type.__code__, 140411217262448) + | | | +- DictGuardManager: source=G['__import_torch_dot_utils_dot__pytree'].SUPPORTED_NODES, accessed_by=GetAttrGuardAccessor(SUPPORTED_NODES) + | | | | +- DICT_VERSION: ___dict_version(G['__import_torch_dot_utils_dot__pytree'].SUPPORTED_NODES) == 519596 + | | | | +- KeyValueManager pair at index=1 + | | | | | +- ValueManager: GuardManager: source=G['__import_torch_dot_utils_dot__pytree'].SUPPORTED_NODES[list(G['__import_torch_dot_utils_dot__pytree'].SUPPORTED_NODES.keys())[1]] + | | | | | | +- GuardManager: source=G['__import_torch_dot_utils_dot__pytree'].SUPPORTED_NODES[list(G['__import_torch_dot_utils_dot__pytree'].SUPPORTED_NODES.keys())[1]].flatten_fn, accessed_by=GetAttrGuardAccessor(flatten_fn) + | | | | | | | +- GuardManager: source=G['__import_torch_dot_utils_dot__pytree'].SUPPORTED_NODES[list(G['__import_torch_dot_utils_dot__pytree'].SUPPORTED_NODES.keys())[1]].flatten_fn.__code__, accessed_by=GetAttrGuardAccessor(__code__) + | | | | | | | | +- ID_MATCH: ___check_obj_id(G['__import_torch_dot_utils_dot__pytree'].SUPPORTED_NODES[list(G['__import_torch_dot_utils_dot__pytree'].SUPPORTED_NODES.keys())[1]].flatten_fn.__code__, 140411196281984) + | | | | | | +- GuardManager: source=G['__import_torch_dot_utils_dot__pytree'].SUPPORTED_NODES[list(G['__import_torch_dot_utils_dot__pytree'].SUPPORTED_NODES.keys())[1]].unflatten_fn, accessed_by=GetAttrGuardAccessor(unflatten_fn) + | | | | | | | +- GuardManager: source=G['__import_torch_dot_utils_dot__pytree'].SUPPORTED_NODES[list(G['__import_torch_dot_utils_dot__pytree'].SUPPORTED_NODES.keys())[1]].unflatten_fn.__code__, accessed_by=GetAttrGuardAccessor(__code__) + | | | | | | | | +- ID_MATCH: ___check_obj_id(G['__import_torch_dot_utils_dot__pytree'].SUPPORTED_NODES[list(G['__import_torch_dot_utils_dot__pytree'].SUPPORTED_NODES.keys())[1]].unflatten_fn.__code__, 140411217182288) + | | | +- GuardManager: source=G['__import_torch_dot_utils_dot__pytree']._tree_flatten_helper, accessed_by=GetAttrGuardAccessor(_tree_flatten_helper) + | | | | +- GuardManager: source=G['__import_torch_dot_utils_dot__pytree']._tree_flatten_helper.__code__, accessed_by=GetAttrGuardAccessor(__code__) + | | | | | +- ID_MATCH: ___check_obj_id(G['__import_torch_dot_utils_dot__pytree']._tree_flatten_helper.__code__, 140411217413040) + | | | +- GuardManager: source=G['__import_torch_dot_utils_dot__pytree']._is_namedtuple_instance, accessed_by=GetAttrGuardAccessor(_is_namedtuple_instance) + | | | | +- GuardManager: source=G['__import_torch_dot_utils_dot__pytree']._is_namedtuple_instance.__code__, accessed_by=GetAttrGuardAccessor(__code__) + | | | | | +- ID_MATCH: ___check_obj_id(G['__import_torch_dot_utils_dot__pytree']._is_namedtuple_instance.__code__, 140411217412592) + | | +- GuardManager: source=G['__import_torch_dot__dynamo_dot_comptime'], accessed_by=DictGetItemGuardAccessor(__import_torch_dot__dynamo_dot_comptime) + | | | +- ID_MATCH: ___check_obj_id(G['__import_torch_dot__dynamo_dot_comptime'], 140410226912176) + | | +- GuardManager: source=G['__import_torch_dot__dynamo_dot_decorators'], accessed_by=DictGetItemGuardAccessor(__import_torch_dot__dynamo_dot_decorators) + | | | +- ID_MATCH: ___check_obj_id(G['__import_torch_dot__dynamo_dot_decorators'], 140410226910096) + | | | +- GuardManager: source=G['__import_torch_dot__dynamo_dot_decorators'].is_compiling, accessed_by=GetAttrGuardAccessor(is_compiling) + | | | | +- ID_MATCH: ___check_obj_id(G['__import_torch_dot__dynamo_dot_decorators'].is_compiling, 140410376252096) + | | +- GuardManager: source=G['__import_torch_dot_nn_dot_attention_dot__utils'], accessed_by=DictGetItemGuardAccessor(__import_torch_dot_nn_dot_attention_dot__utils) + | | | +- ID_MATCH: ___check_obj_id(G['__import_torch_dot_nn_dot_attention_dot__utils'], 140409673896784) + | | | +- GuardManager: source=G['__import_torch_dot_nn_dot_attention_dot__utils']._SUPPORTED_HEAD_DIMS, accessed_by=GetAttrGuardAccessor(_SUPPORTED_HEAD_DIMS) + | | | | +- TYPE_MATCH: ___check_type_id(G['__import_torch_dot_nn_dot_attention_dot__utils']._SUPPORTED_HEAD_DIMS, 8844320) + | | | | +- LENGTH_CHECK: len(G['__import_torch_dot_nn_dot_attention_dot__utils']._SUPPORTED_HEAD_DIMS) == 10 + | | | | +- GuardManager: source=G['__import_torch_dot_nn_dot_attention_dot__utils']._SUPPORTED_HEAD_DIMS[0], accessed_by=ListGetItemGuardAccessor(0) + | | | | | +- EQUALS_MATCH: G['__import_torch_dot_nn_dot_attention_dot__utils']._SUPPORTED_HEAD_DIMS[0] == 2 + | | | | +- GuardManager: source=G['__import_torch_dot_nn_dot_attention_dot__utils']._SUPPORTED_HEAD_DIMS[1], accessed_by=ListGetItemGuardAccessor(1) + | | | | | +- EQUALS_MATCH: G['__import_torch_dot_nn_dot_attention_dot__utils']._SUPPORTED_HEAD_DIMS[1] == 4 + | | | | +- GuardManager: source=G['__import_torch_dot_nn_dot_attention_dot__utils']._SUPPORTED_HEAD_DIMS[2], accessed_by=ListGetItemGuardAccessor(2) + | | | | | +- EQUALS_MATCH: G['__import_torch_dot_nn_dot_attention_dot__utils']._SUPPORTED_HEAD_DIMS[2] == 8 + | | | | +- GuardManager: source=G['__import_torch_dot_nn_dot_attention_dot__utils']._SUPPORTED_HEAD_DIMS[3], accessed_by=ListGetItemGuardAccessor(3) + | | | | | +- EQUALS_MATCH: G['__import_torch_dot_nn_dot_attention_dot__utils']._SUPPORTED_HEAD_DIMS[3] == 16 + | | | | +- GuardManager: source=G['__import_torch_dot_nn_dot_attention_dot__utils']._SUPPORTED_HEAD_DIMS[4], accessed_by=ListGetItemGuardAccessor(4) + | | | | | +- EQUALS_MATCH: G['__import_torch_dot_nn_dot_attention_dot__utils']._SUPPORTED_HEAD_DIMS[4] == 32 + | | | | +- GuardManager: source=G['__import_torch_dot_nn_dot_attention_dot__utils']._SUPPORTED_HEAD_DIMS[5], accessed_by=ListGetItemGuardAccessor(5) + | | | | | +- EQUALS_MATCH: G['__import_torch_dot_nn_dot_attention_dot__utils']._SUPPORTED_HEAD_DIMS[5] == 64 + | | +- GuardManager: source=G['__import_torch_dot_nn_dot_attention_dot_flex_attention'], accessed_by=DictGetItemGuardAccessor(__import_torch_dot_nn_dot_attention_dot_flex_attention) + | | | +- ID_MATCH: ___check_obj_id(G['__import_torch_dot_nn_dot_attention_dot_flex_attention'], 140409673895824) + | | | +- GuardManager: source=G['__import_torch_dot_nn_dot_attention_dot_flex_attention'].math, accessed_by=GetAttrGuardAccessor(math) + | | | | +- ID_MATCH: ___check_obj_id(G['__import_torch_dot_nn_dot_attention_dot_flex_attention'].math, 140413266939392) + | | | | +- GuardManager: source=G['__import_torch_dot_nn_dot_attention_dot_flex_attention'].math.sqrt, accessed_by=GetAttrGuardAccessor(sqrt) + | | | | | +- ID_MATCH: ___check_obj_id(G['__import_torch_dot_nn_dot_attention_dot_flex_attention'].math.sqrt, 140413266943072) + | | | +- GuardManager: source=G['__import_torch_dot_nn_dot_attention_dot_flex_attention'].torch, accessed_by=GetAttrGuardAccessor(torch) + | | | | +- ID_MATCH: ___check_obj_id(G['__import_torch_dot_nn_dot_attention_dot_flex_attention'].torch, 140413267918368) + | | | | +- GuardManager: source=G['__import_torch_dot_nn_dot_attention_dot_flex_attention'].torch._dynamo, accessed_by=GetAttrGuardAccessor(_dynamo) + | | | | | +- ID_MATCH: ___check_obj_id(G['__import_torch_dot_nn_dot_attention_dot_flex_attention'].torch._dynamo, 140413260098400) + | | | | | +- GuardManager: source=G['__import_torch_dot_nn_dot_attention_dot_flex_attention'].torch._dynamo.mark_static, accessed_by=GetAttrGuardAccessor(mark_static) + | | | | | | +- GuardManager: source=G['__import_torch_dot_nn_dot_attention_dot_flex_attention'].torch._dynamo.mark_static.__code__, accessed_by=GetAttrGuardAccessor(__code__) + | | | | | | | +- ID_MATCH: ___check_obj_id(G['__import_torch_dot_nn_dot_attention_dot_flex_attention'].torch._dynamo.mark_static.__code__, 123166432) + | | | | +- GuardManager: source=G['__import_torch_dot_nn_dot_attention_dot_flex_attention'].torch.compiler, accessed_by=GetAttrGuardAccessor(compiler) + | | | | | +- ID_MATCH: ___check_obj_id(G['__import_torch_dot_nn_dot_attention_dot_flex_attention'].torch.compiler, 140410826010400) + | | | | | +- GuardManager: source=G['__import_torch_dot_nn_dot_attention_dot_flex_attention'].torch.compiler.is_dynamo_compiling, accessed_by=GetAttrGuardAccessor(is_dynamo_compiling) + | | | | | | +- ID_MATCH: ___check_obj_id(G['__import_torch_dot_nn_dot_attention_dot_flex_attention'].torch.compiler.is_dynamo_compiling, 140410826132992) + | | | +- GuardManager: source=G['__import_torch_dot_nn_dot_attention_dot_flex_attention']._validate_device, accessed_by=GetAttrGuardAccessor(_validate_device) + | | | | +- GuardManager: source=G['__import_torch_dot_nn_dot_attention_dot_flex_attention']._validate_device.__code__, accessed_by=GetAttrGuardAccessor(__code__) + | | | | | +- ID_MATCH: ___check_obj_id(G['__import_torch_dot_nn_dot_attention_dot_flex_attention']._validate_device.__code__, 140409673611088) + | | | +- GuardManager: source=G['__import_torch_dot_nn_dot_attention_dot_flex_attention'].flex_attention_hop, accessed_by=GetAttrGuardAccessor(flex_attention_hop) + | | | | +- TYPE_MATCH: ___check_type_id(G['__import_torch_dot_nn_dot_attention_dot_flex_attention'].flex_attention_hop, 96992544) + | | | | +- GuardManager: source=G['__import_torch_dot_nn_dot_attention_dot_flex_attention'].flex_attention_hop.__name__, accessed_by=GetAttrGuardAccessor(__name__) + | | | | | +- EQUALS_MATCH: G['__import_torch_dot_nn_dot_attention_dot_flex_attention'].flex_attention_hop.__name__ == 'flex_attention' + | | | +- GuardManager: source=G['__import_torch_dot_nn_dot_attention_dot_flex_attention']._supported_head_dim, accessed_by=GetAttrGuardAccessor(_supported_head_dim) + | | | | +- GuardManager: source=G['__import_torch_dot_nn_dot_attention_dot_flex_attention']._supported_head_dim.__code__, accessed_by=GetAttrGuardAccessor(__code__) + | | | | | +- ID_MATCH: ___check_obj_id(G['__import_torch_dot_nn_dot_attention_dot_flex_attention']._supported_head_dim.__code__, 140409673231376) + | | | +- GuardManager: source=G['__import_torch_dot_nn_dot_attention_dot_flex_attention']._validate_embed_dim, accessed_by=GetAttrGuardAccessor(_validate_embed_dim) + | | | | +- GuardManager: source=G['__import_torch_dot_nn_dot_attention_dot_flex_attention']._validate_embed_dim.__code__, accessed_by=GetAttrGuardAccessor(__code__) + | | | | | +- ID_MATCH: ___check_obj_id(G['__import_torch_dot_nn_dot_attention_dot_flex_attention']._validate_embed_dim.__code__, 388086512) + | | | +- GuardManager: source=G['__import_torch_dot_nn_dot_attention_dot_flex_attention']._validate_sdpa_input, accessed_by=GetAttrGuardAccessor(_validate_sdpa_input) + | | | | +- GuardManager: source=G['__import_torch_dot_nn_dot_attention_dot_flex_attention']._validate_sdpa_input.__code__, accessed_by=GetAttrGuardAccessor(__code__) + | | | | | +- ID_MATCH: ___check_obj_id(G['__import_torch_dot_nn_dot_attention_dot_flex_attention']._validate_sdpa_input.__code__, 387915104) + | | | +- GuardManager: source=G['__import_torch_dot_nn_dot_attention_dot_flex_attention']._apply_kernel_options, accessed_by=GetAttrGuardAccessor(_apply_kernel_options) + | | | | +- GuardManager: source=G['__import_torch_dot_nn_dot_attention_dot_flex_attention']._apply_kernel_options.__code__, accessed_by=GetAttrGuardAccessor(__code__) + | | | | | +- ID_MATCH: ___check_obj_id(G['__import_torch_dot_nn_dot_attention_dot_flex_attention']._apply_kernel_options.__code__, 140409683680752) + +V1003 10:11:03.321000 2235078 torch/_dynamo/utils.py:988] {"chromium_event": {}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0, "has_payload": "341d7b7d938fce7537065544860a0efd"} + { + "name": "entire_frame_compile", + "ts": 1727975463321242.8, + "args": { + "cache_stats": { + "fxgraph_cache_hit": 1, + "fxgraph_cache_miss": 1, + "fxgraph_cache_bypass": 0 + } + }, + "ph": "E", + "cat": "dynamo_timed", + "tid": 0, + "pid": 0 + } +V1003 10:11:03.321000 2235078 torch/_dynamo/utils.py:988] {"chromium_event": {}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0, "has_payload": "02dcbee7c15ab2085e7ee42a1679beb8"} + { + "name": "_compile.compile_inner", + "ts": 1727975463321536.0, + "args": { + "cache_stats": { + "fxgraph_cache_hit": 1, + "fxgraph_cache_miss": 1, + "fxgraph_cache_bypass": 0 + } + }, + "ph": "E", + "cat": "dynamo_timed", + "tid": 0, + "pid": 0 + } +V1003 10:11:03.321000 2235078 torch/_dynamo/utils.py:840] {"compilation_metrics": {"compile_id": "1/0", "frame_key": "2", "co_name": "fn", "co_filename": "/data/users/oulgen/pytorch/test/inductor/test_codecache.py", "co_firstlineno": 379, "cache_size": 0, "accumulated_cache_size": 0, "guard_count": 80, "shape_env_guard_count": 0, "graph_op_count": 11, "graph_node_count": 25, "graph_input_count": 11, "start_time": 1727975462.8240004, "entire_frame_compile_time_s": 0.4971275329589844, "backend_compile_time_s": 0.2832028865814209, "inductor_compile_time_s": 0.1434309482574463, "code_gen_time_s": null, "fail_type": null, "fail_reason": null, "fail_user_frame_filename": null, "fail_user_frame_lineno": null, "non_compliant_ops": [], "compliant_custom_ops": [], "restart_reasons": [], "dynamo_time_before_restart_s": 0.0, "has_guarded_code": true, "possibly_missed_reinplacing_opportunities": 0, "remote_cache_time_saved_s": 6.372999079, "structured_logging_overhead_s": 0.055921114, "config_suppress_errors": false, "config_inline_inbuilt_nn_modules": true, "specialize_float": true}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0} +V1003 10:11:03.325000 2235078 torch/_dynamo/convert_frame.py:915] {"dynamo_start": {"stack": [{"line": 916, "name": "", "filename": 1}, {"line": 14, "name": "run_tests", "filename": 2}, {"line": 38, "name": "run_tests", "filename": 3}, {"line": 1273, "name": "run_tests", "filename": 4}, {"line": 102, "name": "__init__", "filename": 5}, {"line": 274, "name": "runTests", "filename": 5}, {"line": 217, "name": "run", "filename": 6}, {"line": 84, "name": "__call__", "filename": 7}, {"line": 122, "name": "run", "filename": 7}, {"line": 84, "name": "__call__", "filename": 7}, {"line": 122, "name": "run", "filename": 7}, {"line": 678, "name": "__call__", "filename": 8}, {"line": 3116, "name": "run", "filename": 4}, {"line": 3088, "name": "_run_custom", "filename": 4}, {"line": 623, "name": "run", "filename": 8}, {"line": 579, "name": "_callTestMethod", "filename": 8}, {"line": 2983, "name": "wrapper", "filename": 4}, {"line": 81, "name": "inner", "filename": 9}, {"line": 81, "name": "inner", "filename": 9}, {"line": 414, "name": "test_flex_attention_caching", "filename": 1}, {"line": 386, "name": "fn2", "filename": 1}, {"line": 1062, "name": "flex_attention", "filename": 10}, {"line": 1049, "name": "_flex_attention_hop_wrapper", "filename": 10}]}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0} +V1003 10:11:03.325000 2235078 torch/_dynamo/utils.py:988] {"chromium_event": {}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "af0d73822951ce32af4202533e2a8290"} + { + "name": "_compile.compile_inner", + "ts": 1727975463325378.5, + "args": null, + "ph": "B", + "cat": "dynamo_timed", + "tid": 0, + "pid": 0 + } +V1003 10:11:03.325000 2235078 torch/_dynamo/utils.py:988] {"chromium_event": {}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "dc5cd9b8a50a9005377afd5d28ba36fb"} + { + "name": "entire_frame_compile", + "ts": 1727975463325378.5, + "args": null, + "ph": "B", + "cat": "dynamo_timed", + "tid": 0, + "pid": 0 + } +V1003 10:11:03.328000 2235078 torch/_subclasses/meta_utils.py:204] {"describe_storage": {"id": 0, "describer_id": 348, "size": 524288}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0} +V1003 10:11:03.328000 2235078 torch/_subclasses/meta_utils.py:417] {"describe_tensor": {"id": 0, "ndim": 4, "dtype": "torch.float32", "device": "device(type='cuda', index=0)", "size": [1, 4, 512, 64], "is_leaf": true, "stride": [131072, 32768, 64, 1], "storage": 0, "view_func": "", "describer_id": 348}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0} +V1003 10:11:03.328000 2235078 torch/_subclasses/meta_utils.py:1640] {"describe_source": {"describer_id": 348, "id": 0, "source": "L['args'][0]"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0} +V1003 10:11:03.441000 2235078 torch/_subclasses/meta_utils.py:204] {"describe_storage": {"id": 1, "describer_id": 348, "size": 524288}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0} +V1003 10:11:03.442000 2235078 torch/_subclasses/meta_utils.py:417] {"describe_tensor": {"id": 2, "ndim": 4, "dtype": "torch.float32", "device": "device(type='cuda', index=0)", "size": [1, 4, 512, 64], "is_leaf": true, "stride": [131072, 32768, 64, 1], "storage": 1, "view_func": "", "describer_id": 348}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0} +V1003 10:11:03.442000 2235078 torch/_subclasses/meta_utils.py:1640] {"describe_source": {"describer_id": 348, "id": 2, "source": "L['args'][1]"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0} +V1003 10:11:03.443000 2235078 torch/_subclasses/meta_utils.py:204] {"describe_storage": {"id": 2, "describer_id": 348, "size": 524288}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0} +V1003 10:11:03.444000 2235078 torch/_subclasses/meta_utils.py:417] {"describe_tensor": {"id": 3, "ndim": 4, "dtype": "torch.float32", "device": "device(type='cuda', index=0)", "size": [1, 4, 512, 64], "is_leaf": true, "stride": [131072, 32768, 64, 1], "storage": 2, "view_func": "", "describer_id": 348}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0} +V1003 10:11:03.444000 2235078 torch/_subclasses/meta_utils.py:1640] {"describe_source": {"describer_id": 348, "id": 3, "source": "L['args'][2]"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0} +V1003 10:11:03.445000 2235078 torch/_subclasses/meta_utils.py:204] {"describe_storage": {"id": 3, "describer_id": 348, "size": 64}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0} +V1003 10:11:03.445000 2235078 torch/_subclasses/meta_utils.py:417] {"describe_tensor": {"id": 4, "ndim": 3, "dtype": "torch.int32", "device": "device(type='cuda', index=0)", "size": [1, 1, 16], "is_leaf": true, "stride": [16, 16, 1], "storage": 3, "view_func": "", "describer_id": 348}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0} +V1003 10:11:03.445000 2235078 torch/_subclasses/meta_utils.py:1640] {"describe_source": {"describer_id": 348, "id": 4, "source": "L['args'][4][0]"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0} +V1003 10:11:03.446000 2235078 torch/_subclasses/meta_utils.py:204] {"describe_storage": {"id": 4, "describer_id": 348, "size": 1024}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0} +V1003 10:11:03.447000 2235078 torch/_subclasses/meta_utils.py:417] {"describe_tensor": {"id": 5, "ndim": 4, "dtype": "torch.int32", "device": "device(type='cuda', index=0)", "size": [1, 1, 16, 16], "is_leaf": true, "stride": [256, 256, 16, 1], "storage": 4, "view_func": "", "describer_id": 348}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0} +V1003 10:11:03.447000 2235078 torch/_subclasses/meta_utils.py:1640] {"describe_source": {"describer_id": 348, "id": 5, "source": "L['args'][4][1]"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0} +V1003 10:11:03.448000 2235078 torch/_subclasses/meta_utils.py:204] {"describe_storage": {"id": 5, "describer_id": 348, "size": 64}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0} +V1003 10:11:03.448000 2235078 torch/_subclasses/meta_utils.py:417] {"describe_tensor": {"id": 6, "ndim": 3, "dtype": "torch.int32", "device": "device(type='cuda', index=0)", "size": [1, 1, 16], "is_leaf": true, "stride": [16, 16, 1], "storage": 5, "view_func": "", "describer_id": 348}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0} +V1003 10:11:03.448000 2235078 torch/_subclasses/meta_utils.py:1640] {"describe_source": {"describer_id": 348, "id": 6, "source": "L['args'][4][2]"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0} +V1003 10:11:03.449000 2235078 torch/_subclasses/meta_utils.py:204] {"describe_storage": {"id": 6, "describer_id": 348, "size": 1024}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0} +V1003 10:11:03.450000 2235078 torch/_subclasses/meta_utils.py:417] {"describe_tensor": {"id": 7, "ndim": 4, "dtype": "torch.int32", "device": "device(type='cuda', index=0)", "size": [1, 1, 16, 16], "is_leaf": true, "stride": [256, 256, 16, 1], "storage": 6, "view_func": "", "describer_id": 348}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0} +V1003 10:11:03.450000 2235078 torch/_subclasses/meta_utils.py:1640] {"describe_source": {"describer_id": 348, "id": 7, "source": "L['args'][4][3]"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0} +V1003 10:11:03.451000 2235078 torch/_subclasses/meta_utils.py:204] {"describe_storage": {"id": 7, "describer_id": 348, "size": 64}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0} +V1003 10:11:03.451000 2235078 torch/_subclasses/meta_utils.py:417] {"describe_tensor": {"id": 8, "ndim": 3, "dtype": "torch.int32", "device": "device(type='cuda', index=0)", "size": [1, 1, 16], "is_leaf": true, "stride": [16, 16, 1], "storage": 7, "view_func": "", "describer_id": 348}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0} +V1003 10:11:03.451000 2235078 torch/_subclasses/meta_utils.py:1640] {"describe_source": {"describer_id": 348, "id": 8, "source": "L['args'][4][4]"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0} +V1003 10:11:03.452000 2235078 torch/_subclasses/meta_utils.py:204] {"describe_storage": {"id": 8, "describer_id": 348, "size": 1024}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0} +V1003 10:11:03.453000 2235078 torch/_subclasses/meta_utils.py:417] {"describe_tensor": {"id": 9, "ndim": 4, "dtype": "torch.int32", "device": "device(type='cuda', index=0)", "size": [1, 1, 16, 16], "is_leaf": true, "stride": [256, 256, 16, 1], "storage": 8, "view_func": "", "describer_id": 348}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0} +V1003 10:11:03.453000 2235078 torch/_subclasses/meta_utils.py:1640] {"describe_source": {"describer_id": 348, "id": 9, "source": "L['args'][4][5]"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0} +V1003 10:11:03.454000 2235078 torch/_subclasses/meta_utils.py:204] {"describe_storage": {"id": 9, "describer_id": 348, "size": 64}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0} +V1003 10:11:03.454000 2235078 torch/_subclasses/meta_utils.py:417] {"describe_tensor": {"id": 10, "ndim": 3, "dtype": "torch.int32", "device": "device(type='cuda', index=0)", "size": [1, 1, 16], "is_leaf": true, "stride": [16, 16, 1], "storage": 9, "view_func": "", "describer_id": 348}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0} +V1003 10:11:03.455000 2235078 torch/_subclasses/meta_utils.py:1640] {"describe_source": {"describer_id": 348, "id": 10, "source": "L['args'][4][6]"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0} +V1003 10:11:03.456000 2235078 torch/_subclasses/meta_utils.py:204] {"describe_storage": {"id": 10, "describer_id": 348, "size": 1024}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0} +V1003 10:11:03.456000 2235078 torch/_subclasses/meta_utils.py:417] {"describe_tensor": {"id": 11, "ndim": 4, "dtype": "torch.int32", "device": "device(type='cuda', index=0)", "size": [1, 1, 16, 16], "is_leaf": true, "stride": [256, 256, 16, 1], "storage": 10, "view_func": "", "describer_id": 348}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0} +V1003 10:11:03.456000 2235078 torch/_subclasses/meta_utils.py:1640] {"describe_source": {"describer_id": 348, "id": 11, "source": "L['args'][4][7]"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0} +V1003 10:11:03.464000 2235078 torch/_dynamo/output_graph.py:1347] {"dynamo_output_graph": {"sizes": {"l_args_0_": [1, 4, 512, 64], "l_args_1_": [1, 4, 512, 64], "l_args_2_": [1, 4, 512, 64], "l_args_4_0_": [1, 1, 16], "l_args_4_1_": [1, 1, 16, 16], "l_args_4_2_": [1, 1, 16], "l_args_4_3_": [1, 1, 16, 16], "l_args_4_4_": [1, 1, 16], "l_args_4_5_": [1, 1, 16, 16], "l_args_4_6_": [1, 1, 16], "l_args_4_7_": [1, 1, 16, 16], "child_1": [], "child_2": [], "child_3": [], "child_4": [], "child": [], "child_5": [], "child_6": [], "child_7": [], "child_8": [], "getitem": [1, 4, 512, 64], "getitem_1": [1, 4, 512]}}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "c759d677da3d86354d83c3018e999089"} + class GraphModule(torch.nn.Module): + def forward(self, L_args_0_: "f32[1, 4, 512, 64][131072, 32768, 64, 1]cuda:0", L_args_1_: "f32[1, 4, 512, 64][131072, 32768, 64, 1]cuda:0", L_args_2_: "f32[1, 4, 512, 64][131072, 32768, 64, 1]cuda:0", L_args_4_0_: "i32[1, 1, 16][16, 16, 1]cuda:0", L_args_4_1_: "i32[1, 1, 16, 16][256, 256, 16, 1]cuda:0", L_args_4_2_: "i32[1, 1, 16][16, 16, 1]cuda:0", L_args_4_3_: "i32[1, 1, 16, 16][256, 256, 16, 1]cuda:0", L_args_4_4_: "i32[1, 1, 16][16, 16, 1]cuda:0", L_args_4_5_: "i32[1, 1, 16, 16][256, 256, 16, 1]cuda:0", L_args_4_6_: "i32[1, 1, 16][16, 16, 1]cuda:0", L_args_4_7_: "i32[1, 1, 16, 16][256, 256, 16, 1]cuda:0"): + l_args_0_ = L_args_0_ + l_args_1_ = L_args_1_ + l_args_2_ = L_args_2_ + l_args_4_0_ = L_args_4_0_ + l_args_4_1_ = L_args_4_1_ + l_args_4_2_ = L_args_4_2_ + l_args_4_3_ = L_args_4_3_ + l_args_4_4_ = L_args_4_4_ + l_args_4_5_ = L_args_4_5_ + l_args_4_6_ = L_args_4_6_ + l_args_4_7_ = L_args_4_7_ + + # File: /data/users/oulgen/pytorch/torch/nn/attention/flex_attention.py:1050 in _flex_attention_hop_wrapper, code: return flex_attention_hop(*args, **kwargs) + child_1: "i32[][]cuda:0" = l_args_0_.new_empty([], dtype = torch.int32); child_1 = None + child_2: "i32[][]cuda:0" = l_args_0_.new_empty([], dtype = torch.int32); child_2 = None + child_3: "i32[][]cuda:0" = l_args_0_.new_empty([], dtype = torch.int32); child_3 = None + child_4: "i32[][]cuda:0" = l_args_0_.new_empty([], dtype = torch.int32); child_4 = None + child: "f32[][]cuda:0" = l_args_0_.new_empty([], requires_grad = False); child = None + score_mod_0 = self.score_mod_0 + child_5: "i32[][]cuda:0" = l_args_0_.new_empty([], dtype = torch.int32); child_5 = None + child_6: "i32[][]cuda:0" = l_args_0_.new_empty([], dtype = torch.int32); child_6 = None + child_7: "i32[][]cuda:0" = l_args_0_.new_empty([], dtype = torch.int32); child_7 = None + child_8: "i32[][]cuda:0" = l_args_0_.new_empty([], dtype = torch.int32); child_8 = None + mask_fn_0 = self.mask_fn_0 + flex_attention = torch.ops.higher_order.flex_attention(l_args_0_, l_args_1_, l_args_2_, score_mod_0, (l_args_4_0_, l_args_4_1_, l_args_4_2_, l_args_4_3_, l_args_4_4_, l_args_4_5_, l_args_4_6_, l_args_4_7_, 128, 128, mask_fn_0), 0.125, {'ROWS_GUARANTEED_SAFE': False, 'PRESCALE_QK': False, 'OUTPUT_LOGSUMEXP': False}, (), ()); l_args_0_ = l_args_1_ = l_args_2_ = score_mod_0 = l_args_4_0_ = l_args_4_1_ = l_args_4_2_ = l_args_4_3_ = l_args_4_4_ = l_args_4_5_ = l_args_4_6_ = l_args_4_7_ = mask_fn_0 = None + getitem: "f32[1, 4, 512, 64][131072, 32768, 64, 1]cuda:0" = flex_attention[0] + getitem_1: "f32[1, 4, 512][2048, 512, 1]cuda:0" = flex_attention[1]; flex_attention = None + return (getitem, getitem_1) + + class score_mod_0(torch.nn.Module): + def forward(self, child: "f32[][]cuda:0", child_1: "i32[][]cuda:0", child_2: "i32[][]cuda:0", child_3: "i32[][]cuda:0", child_4: "i32[][]cuda:0"): + return child + + class mask_fn_0(torch.nn.Module): + def forward(self, child_5: "i32[][]cuda:0", child_6: "i32[][]cuda:0", child_7: "i32[][]cuda:0", child_8: "i32[][]cuda:0"): + # File: /data/users/oulgen/pytorch/test/inductor/test_codecache.py:373 in , code: lambda b, h, q, kv: q >= kv, None, None, 2048, 2048 + ge: "b8[][]cuda:0" = child_7 >= child_8; child_7 = child_8 = None + return ge + +V1003 10:11:03.465000 2235078 torch/_dynamo/utils.py:988] {"chromium_event": {}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "3f2a0e3577c204c69d66ef7550317ca5"} + { + "name": "OutputGraph.call_user_compiler", + "ts": 1727975463465456.5, + "args": null, + "ph": "B", + "cat": "dynamo_timed", + "tid": 0, + "pid": 0 + } +V1003 10:11:03.465000 2235078 torch/_dynamo/utils.py:988] {"chromium_event": {}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "b48e24df1eadf9ac86a2058fefc95dfb"} + { + "name": "backend_compile", + "ts": 1727975463465456.5, + "args": null, + "ph": "B", + "cat": "dynamo_timed", + "tid": 0, + "pid": 0 + } +V1003 10:11:03.466000 2235078 torch/_dynamo/utils.py:988] {"chromium_event": {}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "9ed8f1c18e744cba5c7e86e34c900750"} + { + "name": "backend_compile", + "ts": 1727975463466215.0, + "args": { + "cache_stats": { + "fxgraph_cache_hit": 1, + "fxgraph_cache_miss": 1, + "fxgraph_cache_bypass": 0 + } + }, + "ph": "E", + "cat": "dynamo_timed", + "tid": 0, + "pid": 0 + } +V1003 10:11:03.466000 2235078 torch/_dynamo/utils.py:988] {"chromium_event": {}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "c19895e106748f6b125e2df40adb8b9e"} + { + "name": "OutputGraph.call_user_compiler", + "ts": 1727975463466480.0, + "args": { + "cache_stats": { + "fxgraph_cache_hit": 1, + "fxgraph_cache_miss": 1, + "fxgraph_cache_bypass": 0 + } + }, + "ph": "E", + "cat": "dynamo_timed", + "tid": 0, + "pid": 0 + } +V1003 10:11:03.490000 2235078 torch/_dynamo/guards.py:2311] {"dynamo_cpp_guards_str": {}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "6c94a4177e3b74bb7d4cee6cdd33898c"} + + TREE_GUARD_MANAGER: + +- RootGuardManager + | +- DEFAULT_DEVICE: utils_device.CURRENT_DEVICE == None # _dynamo/output_graph.py:471 in init_ambient_guards + | +- GLOBAL_STATE: ___check_global_state() + | +- TORCH_FUNCTION_MODE_STACK: ___check_torch_function_mode_stack() + | +- GuardManager: source=L['args'], accessed_by=DictGetItemGuardAccessor(args) + | | +- TYPE_MATCH: ___check_type_id(L['args'], 8815232) + | | +- LENGTH_CHECK: len(L['args']) == 7 + | | +- GuardManager: source=L['args'][0], accessed_by=TupleGetItemGuardAccessor(0) + | | | +- TENSOR_MATCH: check_tensor(L['args'][0], Tensor, DispatchKeySet(CUDA, BackendSelect, ADInplaceOrView, AutogradCUDA), torch.float32, device=0, requires_grad=False, size=[1, 4, 512, 64], stride=[131072, 32768, 64, 1]) + | | | +- NO_HASATTR: hasattr(L['args'][0], '_dynamo_dynamic_indices') == False + | | | +- NO_TENSOR_ALIASING: check_no_aliasing(L['args'][0], L['args'][1], L['args'][2], L['args'][4][0], L['args'][4][1], L['args'][4][2], L['args'][4][3], L['args'][4][4], L['args'][4][5], L['args'][4][6], L['args'][4][7]) + | | +- GuardManager: source=L['args'][1], accessed_by=TupleGetItemGuardAccessor(1) + | | | +- TENSOR_MATCH: check_tensor(L['args'][1], Tensor, DispatchKeySet(CUDA, BackendSelect, ADInplaceOrView, AutogradCUDA), torch.float32, device=0, requires_grad=False, size=[1, 4, 512, 64], stride=[131072, 32768, 64, 1]) + | | | +- NO_HASATTR: hasattr(L['args'][1], '_dynamo_dynamic_indices') == False + | | | +- NO_TENSOR_ALIASING + | | +- GuardManager: source=L['args'][2], accessed_by=TupleGetItemGuardAccessor(2) + | | | +- TENSOR_MATCH: check_tensor(L['args'][2], Tensor, DispatchKeySet(CUDA, BackendSelect, ADInplaceOrView, AutogradCUDA), torch.float32, device=0, requires_grad=False, size=[1, 4, 512, 64], stride=[131072, 32768, 64, 1]) + | | | +- NO_HASATTR: hasattr(L['args'][2], '_dynamo_dynamic_indices') == False + | | | +- NO_TENSOR_ALIASING + | | +- GuardManager: source=L['args'][3], accessed_by=TupleGetItemGuardAccessor(3) + | | | +- GuardManager: source=L['args'][3].__code__, accessed_by=GetAttrGuardAccessor(__code__) + | | | | +- ID_MATCH: ___check_obj_id(L['args'][3].__code__, 140413275216112) + | | +- GuardManager: source=L['args'][4], accessed_by=TupleGetItemGuardAccessor(4) + | | | +- TYPE_MATCH: ___check_type_id(L['args'][4], 8815232) + | | | +- LENGTH_CHECK: len(L['args'][4]) == 11 + | | | +- GuardManager: source=L['args'][4][0], accessed_by=TupleGetItemGuardAccessor(0) + | | | | +- TENSOR_MATCH: check_tensor(L['args'][4][0], Tensor, DispatchKeySet(CUDA, BackendSelect, ADInplaceOrView, AutogradCUDA), torch.int32, device=0, requires_grad=False, size=[1, 1, 16], stride=[16, 16, 1]) + | | | | +- NO_HASATTR: hasattr(L['args'][4][0], '_dynamo_dynamic_indices') == False + | | | | +- NO_TENSOR_ALIASING + | | | +- GuardManager: source=L['args'][4][1], accessed_by=TupleGetItemGuardAccessor(1) + | | | | +- TENSOR_MATCH: check_tensor(L['args'][4][1], Tensor, DispatchKeySet(CUDA, BackendSelect, ADInplaceOrView, AutogradCUDA), torch.int32, device=0, requires_grad=False, size=[1, 1, 16, 16], stride=[256, 256, 16, 1]) + | | | | +- NO_HASATTR: hasattr(L['args'][4][1], '_dynamo_dynamic_indices') == False + | | | | +- NO_TENSOR_ALIASING + | | | +- GuardManager: source=L['args'][4][2], accessed_by=TupleGetItemGuardAccessor(2) + | | | | +- TENSOR_MATCH: check_tensor(L['args'][4][2], Tensor, DispatchKeySet(CUDA, BackendSelect, ADInplaceOrView, AutogradCUDA), torch.int32, device=0, requires_grad=False, size=[1, 1, 16], stride=[16, 16, 1]) + | | | | +- NO_HASATTR: hasattr(L['args'][4][2], '_dynamo_dynamic_indices') == False + | | | | +- NO_TENSOR_ALIASING + | | | +- GuardManager: source=L['args'][4][3], accessed_by=TupleGetItemGuardAccessor(3) + | | | | +- TENSOR_MATCH: check_tensor(L['args'][4][3], Tensor, DispatchKeySet(CUDA, BackendSelect, ADInplaceOrView, AutogradCUDA), torch.int32, device=0, requires_grad=False, size=[1, 1, 16, 16], stride=[256, 256, 16, 1]) + | | | | +- NO_HASATTR: hasattr(L['args'][4][3], '_dynamo_dynamic_indices') == False + | | | | +- NO_TENSOR_ALIASING + | | | +- GuardManager: source=L['args'][4][4], accessed_by=TupleGetItemGuardAccessor(4) + | | | | +- TENSOR_MATCH: check_tensor(L['args'][4][4], Tensor, DispatchKeySet(CUDA, BackendSelect, ADInplaceOrView, AutogradCUDA), torch.int32, device=0, requires_grad=False, size=[1, 1, 16], stride=[16, 16, 1]) + | | | | +- NO_HASATTR: hasattr(L['args'][4][4], '_dynamo_dynamic_indices') == False + | | | | +- NO_TENSOR_ALIASING + | | | +- GuardManager: source=L['args'][4][5], accessed_by=TupleGetItemGuardAccessor(5) + | | | | +- TENSOR_MATCH: check_tensor(L['args'][4][5], Tensor, DispatchKeySet(CUDA, BackendSelect, ADInplaceOrView, AutogradCUDA), torch.int32, device=0, requires_grad=False, size=[1, 1, 16, 16], stride=[256, 256, 16, 1]) + | | | | +- NO_HASATTR: hasattr(L['args'][4][5], '_dynamo_dynamic_indices') == False + | | | | +- NO_TENSOR_ALIASING + | | | +- GuardManager: source=L['args'][4][6], accessed_by=TupleGetItemGuardAccessor(6) + | | | | +- TENSOR_MATCH: check_tensor(L['args'][4][6], Tensor, DispatchKeySet(CUDA, BackendSelect, ADInplaceOrView, AutogradCUDA), torch.int32, device=0, requires_grad=False, size=[1, 1, 16], stride=[16, 16, 1]) + | | | | +- NO_HASATTR: hasattr(L['args'][4][6], '_dynamo_dynamic_indices') == False + | | | | +- NO_TENSOR_ALIASING + | | | +- GuardManager: source=L['args'][4][7], accessed_by=TupleGetItemGuardAccessor(7) + | | | | +- TENSOR_MATCH: check_tensor(L['args'][4][7], Tensor, DispatchKeySet(CUDA, BackendSelect, ADInplaceOrView, AutogradCUDA), torch.int32, device=0, requires_grad=False, size=[1, 1, 16, 16], stride=[256, 256, 16, 1]) + | | | | +- NO_HASATTR: hasattr(L['args'][4][7], '_dynamo_dynamic_indices') == False + | | | | +- NO_TENSOR_ALIASING + | | | +- GuardManager: source=L['args'][4][8], accessed_by=TupleGetItemGuardAccessor(8) + | | | | +- EQUALS_MATCH: L['args'][4][8] == 128 + | | | +- GuardManager: source=L['args'][4][9], accessed_by=TupleGetItemGuardAccessor(9) + | | | | +- EQUALS_MATCH: L['args'][4][9] == 128 + | | | +- GuardManager: source=L['args'][4][10], accessed_by=TupleGetItemGuardAccessor(10) + | | | | +- GuardManager: source=L['args'][4][10].__code__, accessed_by=GetAttrGuardAccessor(__code__) + | | | | | +- ID_MATCH: ___check_obj_id(L['args'][4][10].__code__, 140413271880128) + | | +- GuardManager: source=L['args'][5], accessed_by=TupleGetItemGuardAccessor(5) + | | | +- EQUALS_MATCH: L['args'][5] == 0.125 + | | +- GuardManager: source=L['args'][6], accessed_by=TupleGetItemGuardAccessor(6) + | | | +- DICT_LENGTH: len(L['args'][6]) == 3 + | | | +- GuardManager: source=L['args'][6]['ROWS_GUARANTEED_SAFE'], accessed_by=DictGetItemGuardAccessor(ROWS_GUARANTEED_SAFE) + | | | | +- ID_MATCH: ___check_obj_id(L['args'][6]['ROWS_GUARANTEED_SAFE'], 8910592) + | | | +- GuardManager: source=L['args'][6]['PRESCALE_QK'], accessed_by=DictGetItemGuardAccessor(PRESCALE_QK) + | | | | +- ID_MATCH: ___check_obj_id(L['args'][6]['PRESCALE_QK'], 8910592) + | | | +- GuardManager: source=L['args'][6]['OUTPUT_LOGSUMEXP'], accessed_by=DictGetItemGuardAccessor(OUTPUT_LOGSUMEXP) + | | | | +- ID_MATCH: ___check_obj_id(L['args'][6]['OUTPUT_LOGSUMEXP'], 8910592) + | +- GuardManager: source=L['kwargs'], accessed_by=DictGetItemGuardAccessor(kwargs) + | | +- DICT_LENGTH: not L['kwargs'] + | +- GuardManager: source=G, accessed_by=GlobalsGuardAccessor + | | +- GuardManager: source=G['flex_attention_hop'], accessed_by=DictGetItemGuardAccessor(flex_attention_hop) + | | | +- TYPE_MATCH: ___check_type_id(G['flex_attention_hop'], 96992544) + | | | +- GuardManager: source=G['flex_attention_hop'].__name__, accessed_by=GetAttrGuardAccessor(__name__) + | | | | +- EQUALS_MATCH: G['flex_attention_hop'].__name__ == 'flex_attention' + | | +- GuardManager: source=G['__builtins_dict___8'], accessed_by=DictGetItemGuardAccessor(__builtins_dict___8) + | | | +- GuardManager: source=G['__builtins_dict___8']['len'], accessed_by=DictGetItemGuardAccessor(len) + | | | | +- ID_MATCH: ___check_obj_id(G['__builtins_dict___8']['len'], 140413275558816) + | | | +- GuardManager: source=G['__builtins_dict___8']['sum'], accessed_by=DictGetItemGuardAccessor(sum) + | | | | +- ID_MATCH: ___check_obj_id(G['__builtins_dict___8']['sum'], 140413275559936) + | | | +- GuardManager: source=G['__builtins_dict___8']['list'], accessed_by=DictGetItemGuardAccessor(list) + | | | | +- ID_MATCH: ___check_obj_id(G['__builtins_dict___8']['list'], 8844320) + | | | +- GuardManager: source=G['__builtins_dict___8']['type'], accessed_by=DictGetItemGuardAccessor(type) + | | | | +- ID_MATCH: ___check_obj_id(G['__builtins_dict___8']['type'], 8813248) + | | | +- GuardManager: source=G['__builtins_dict___8']['tuple'], accessed_by=DictGetItemGuardAccessor(tuple) + | | | | +- ID_MATCH: ___check_obj_id(G['__builtins_dict___8']['tuple'], 8815232) + | | | +- GuardManager: source=G['__builtins_dict___8']['object'], accessed_by=DictGetItemGuardAccessor(object) + | | | | +- ID_MATCH: ___check_obj_id(G['__builtins_dict___8']['object'], 8813984) + | | | +- GuardManager: source=G['__builtins_dict___8']['isinstance'], accessed_by=DictGetItemGuardAccessor(isinstance) + | | | | +- ID_MATCH: ___check_obj_id(G['__builtins_dict___8']['isinstance'], 140413275558496) + | | +- GuardManager: source=G['__import_torch_dot_utils_dot__pytree'], accessed_by=DictGetItemGuardAccessor(__import_torch_dot_utils_dot__pytree) + | | | +- ID_MATCH: ___check_obj_id(G['__import_torch_dot_utils_dot__pytree'], 140411217627952) + | | | +- GuardManager: source=G['__import_torch_dot_utils_dot__pytree'].TreeSpec, accessed_by=GetAttrGuardAccessor(TreeSpec) + | | | | +- ID_MATCH: ___check_obj_id(G['__import_torch_dot_utils_dot__pytree'].TreeSpec, 84866496) + | | | +- GuardManager: source=G['__import_torch_dot_utils_dot__pytree']._is_leaf, accessed_by=GetAttrGuardAccessor(_is_leaf) + | | | | +- GuardManager: source=G['__import_torch_dot_utils_dot__pytree']._is_leaf.__code__, accessed_by=GetAttrGuardAccessor(__code__) + | | | | | +- ID_MATCH: ___check_obj_id(G['__import_torch_dot_utils_dot__pytree']._is_leaf.__code__, 140411217262720) + | | | +- GuardManager: source=G['__import_torch_dot_utils_dot__pytree']._LEAF_SPEC, accessed_by=GetAttrGuardAccessor(_LEAF_SPEC) + | | | | +- TYPE_MATCH: ___check_type_id(G['__import_torch_dot_utils_dot__pytree']._LEAF_SPEC, 85171104) + | | | | +- GuardManager: source=G['__import_torch_dot_utils_dot__pytree']._LEAF_SPEC.type, accessed_by=GetAttrGuardAccessor(type) + | | | | | +- ID_MATCH: ___check_obj_id(G['__import_torch_dot_utils_dot__pytree']._LEAF_SPEC.type, 8825760) + | | | | +- GuardManager: source=G['__import_torch_dot_utils_dot__pytree']._LEAF_SPEC.context, accessed_by=GetAttrGuardAccessor(context) + | | | | | +- ID_MATCH: ___check_obj_id(G['__import_torch_dot_utils_dot__pytree']._LEAF_SPEC.context, 8825760) + | | | | +- GuardManager: source=G['__import_torch_dot_utils_dot__pytree']._LEAF_SPEC.num_nodes, accessed_by=GetAttrGuardAccessor(num_nodes) + | | | | | +- EQUALS_MATCH: G['__import_torch_dot_utils_dot__pytree']._LEAF_SPEC.num_nodes == 1 + | | | | +- GuardManager: source=G['__import_torch_dot_utils_dot__pytree']._LEAF_SPEC.num_leaves, accessed_by=GetAttrGuardAccessor(num_leaves) + | | | | | +- EQUALS_MATCH: G['__import_torch_dot_utils_dot__pytree']._LEAF_SPEC.num_leaves == 1 + | | | | +- GuardManager: source=G['__import_torch_dot_utils_dot__pytree']._LEAF_SPEC.num_children, accessed_by=GetAttrGuardAccessor(num_children) + | | | | | +- EQUALS_MATCH: G['__import_torch_dot_utils_dot__pytree']._LEAF_SPEC.num_children == 0 + | | | | +- GuardManager: source=G['__import_torch_dot_utils_dot__pytree']._LEAF_SPEC.children_specs, accessed_by=GetAttrGuardAccessor(children_specs) + | | | | | +- TYPE_MATCH: ___check_type_id(G['__import_torch_dot_utils_dot__pytree']._LEAF_SPEC.children_specs, 8844320) + | | | | | +- LENGTH_CHECK: not G['__import_torch_dot_utils_dot__pytree']._LEAF_SPEC.children_specs + | | | +- GuardManager: source=G['__import_torch_dot_utils_dot__pytree']._get_node_type, accessed_by=GetAttrGuardAccessor(_get_node_type) + | | | | +- GuardManager: source=G['__import_torch_dot_utils_dot__pytree']._get_node_type.__code__, accessed_by=GetAttrGuardAccessor(__code__) + | | | | | +- ID_MATCH: ___check_obj_id(G['__import_torch_dot_utils_dot__pytree']._get_node_type.__code__, 140411217262448) + | | | +- DictGuardManager: source=G['__import_torch_dot_utils_dot__pytree'].SUPPORTED_NODES, accessed_by=GetAttrGuardAccessor(SUPPORTED_NODES) + | | | | +- DICT_VERSION: ___dict_version(G['__import_torch_dot_utils_dot__pytree'].SUPPORTED_NODES) == 519596 + | | | | +- KeyValueManager pair at index=1 + | | | | | +- ValueManager: GuardManager: source=G['__import_torch_dot_utils_dot__pytree'].SUPPORTED_NODES[list(G['__import_torch_dot_utils_dot__pytree'].SUPPORTED_NODES.keys())[1]] + | | | | | | +- GuardManager: source=G['__import_torch_dot_utils_dot__pytree'].SUPPORTED_NODES[list(G['__import_torch_dot_utils_dot__pytree'].SUPPORTED_NODES.keys())[1]].flatten_fn, accessed_by=GetAttrGuardAccessor(flatten_fn) + | | | | | | | +- GuardManager: source=G['__import_torch_dot_utils_dot__pytree'].SUPPORTED_NODES[list(G['__import_torch_dot_utils_dot__pytree'].SUPPORTED_NODES.keys())[1]].flatten_fn.__code__, accessed_by=GetAttrGuardAccessor(__code__) + | | | | | | | | +- ID_MATCH: ___check_obj_id(G['__import_torch_dot_utils_dot__pytree'].SUPPORTED_NODES[list(G['__import_torch_dot_utils_dot__pytree'].SUPPORTED_NODES.keys())[1]].flatten_fn.__code__, 140411196281984) + | | | | | | +- GuardManager: source=G['__import_torch_dot_utils_dot__pytree'].SUPPORTED_NODES[list(G['__import_torch_dot_utils_dot__pytree'].SUPPORTED_NODES.keys())[1]].unflatten_fn, accessed_by=GetAttrGuardAccessor(unflatten_fn) + | | | | | | | +- GuardManager: source=G['__import_torch_dot_utils_dot__pytree'].SUPPORTED_NODES[list(G['__import_torch_dot_utils_dot__pytree'].SUPPORTED_NODES.keys())[1]].unflatten_fn.__code__, accessed_by=GetAttrGuardAccessor(__code__) + | | | | | | | | +- ID_MATCH: ___check_obj_id(G['__import_torch_dot_utils_dot__pytree'].SUPPORTED_NODES[list(G['__import_torch_dot_utils_dot__pytree'].SUPPORTED_NODES.keys())[1]].unflatten_fn.__code__, 140411217182288) + | | | +- GuardManager: source=G['__import_torch_dot_utils_dot__pytree']._tree_flatten_helper, accessed_by=GetAttrGuardAccessor(_tree_flatten_helper) + | | | | +- GuardManager: source=G['__import_torch_dot_utils_dot__pytree']._tree_flatten_helper.__code__, accessed_by=GetAttrGuardAccessor(__code__) + | | | | | +- ID_MATCH: ___check_obj_id(G['__import_torch_dot_utils_dot__pytree']._tree_flatten_helper.__code__, 140411217413040) + | | | +- GuardManager: source=G['__import_torch_dot_utils_dot__pytree']._is_namedtuple_instance, accessed_by=GetAttrGuardAccessor(_is_namedtuple_instance) + | | | | +- GuardManager: source=G['__import_torch_dot_utils_dot__pytree']._is_namedtuple_instance.__code__, accessed_by=GetAttrGuardAccessor(__code__) + | | | | | +- ID_MATCH: ___check_obj_id(G['__import_torch_dot_utils_dot__pytree']._is_namedtuple_instance.__code__, 140411217412592) + +V1003 10:11:03.491000 2235078 torch/_dynamo/utils.py:988] {"chromium_event": {}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "eda00b5e5981dad4abbde14a07a38b9c"} + { + "name": "entire_frame_compile", + "ts": 1727975463491399.2, + "args": { + "cache_stats": { + "fxgraph_cache_hit": 1, + "fxgraph_cache_miss": 1, + "fxgraph_cache_bypass": 0 + } + }, + "ph": "E", + "cat": "dynamo_timed", + "tid": 0, + "pid": 0 + } +V1003 10:11:03.491000 2235078 torch/_dynamo/utils.py:988] {"chromium_event": {}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "dc3c58a766d43af7e167ce88702b4e5c"} + { + "name": "_compile.compile_inner", + "ts": 1727975463491701.2, + "args": { + "cache_stats": { + "fxgraph_cache_hit": 1, + "fxgraph_cache_miss": 1, + "fxgraph_cache_bypass": 0 + } + }, + "ph": "E", + "cat": "dynamo_timed", + "tid": 0, + "pid": 0 + } +V1003 10:11:03.492000 2235078 torch/_dynamo/utils.py:840] {"compilation_metrics": {"compile_id": "0/0", "frame_key": "1", "co_name": "_flex_attention_hop_wrapper", "co_filename": "/data/users/oulgen/pytorch/torch/nn/attention/flex_attention.py", "co_firstlineno": 1049, "cache_size": 0, "accumulated_cache_size": 0, "guard_count": 55, "shape_env_guard_count": 0, "graph_op_count": 12, "graph_node_count": 26, "graph_input_count": 11, "start_time": 1727975463.3253677, "entire_frame_compile_time_s": 0.16592144966125488, "backend_compile_time_s": 0.0006587505340576172, "inductor_compile_time_s": null, "code_gen_time_s": null, "fail_type": null, "fail_reason": null, "fail_user_frame_filename": null, "fail_user_frame_lineno": null, "non_compliant_ops": [], "compliant_custom_ops": [], "restart_reasons": [], "dynamo_time_before_restart_s": 0.0, "has_guarded_code": true, "possibly_missed_reinplacing_opportunities": 0, "remote_cache_time_saved_s": 0, "structured_logging_overhead_s": 0.04651781599999999, "config_suppress_errors": false, "config_inline_inbuilt_nn_modules": true, "specialize_float": true}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0} +V1003 10:11:03.496000 2235078 torch/_dynamo/convert_frame.py:915] {"dynamo_start": {"stack": [{"line": 916, "name": "", "filename": 1}, {"line": 14, "name": "run_tests", "filename": 2}, {"line": 38, "name": "run_tests", "filename": 3}, {"line": 1273, "name": "run_tests", "filename": 4}, {"line": 102, "name": "__init__", "filename": 5}, {"line": 274, "name": "runTests", "filename": 5}, {"line": 217, "name": "run", "filename": 6}, {"line": 84, "name": "__call__", "filename": 7}, {"line": 122, "name": "run", "filename": 7}, {"line": 84, "name": "__call__", "filename": 7}, {"line": 122, "name": "run", "filename": 7}, {"line": 678, "name": "__call__", "filename": 8}, {"line": 3116, "name": "run", "filename": 4}, {"line": 3088, "name": "_run_custom", "filename": 4}, {"line": 623, "name": "run", "filename": 8}, {"line": 579, "name": "_callTestMethod", "filename": 8}, {"line": 2983, "name": "wrapper", "filename": 4}, {"line": 81, "name": "inner", "filename": 9}, {"line": 81, "name": "inner", "filename": 9}, {"line": 414, "name": "test_flex_attention_caching", "filename": 1}, {"line": 385, "name": "fn2", "filename": 1}]}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0} +V1003 10:11:03.496000 2235078 torch/_dynamo/utils.py:988] {"chromium_event": {}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0, "has_payload": "68d9557f97487c1ecd28ffaf4020aff9"} + { + "name": "_compile.compile_inner", + "ts": 1727975463496392.5, + "args": null, + "ph": "B", + "cat": "dynamo_timed", + "tid": 0, + "pid": 0 + } +V1003 10:11:03.496000 2235078 torch/_dynamo/utils.py:988] {"chromium_event": {}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0, "has_payload": "eab0483a7d9a0ee867d4596b4367f9e6"} + { + "name": "entire_frame_compile", + "ts": 1727975463496392.5, + "args": null, + "ph": "B", + "cat": "dynamo_timed", + "tid": 0, + "pid": 0 + } +V1003 10:11:03.499000 2235078 torch/_subclasses/meta_utils.py:204] {"describe_storage": {"id": 0, "describer_id": 362, "size": 524288}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0} +V1003 10:11:03.499000 2235078 torch/_subclasses/meta_utils.py:417] {"describe_tensor": {"id": 0, "ndim": 4, "dtype": "torch.float32", "device": "device(type='cuda', index=0)", "size": [1, 4, 512, 64], "is_leaf": true, "stride": [131072, 32768, 64, 1], "storage": 0, "view_func": "", "describer_id": 362}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0} +V1003 10:11:03.499000 2235078 torch/_subclasses/meta_utils.py:1640] {"describe_source": {"describer_id": 362, "id": 0, "source": "L['q']"}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0} +V1003 10:11:03.508000 2235078 torch/_subclasses/meta_utils.py:204] {"describe_storage": {"id": 1, "describer_id": 362, "size": 524288}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0} +V1003 10:11:03.508000 2235078 torch/_subclasses/meta_utils.py:417] {"describe_tensor": {"id": 1, "ndim": 4, "dtype": "torch.float32", "device": "device(type='cuda', index=0)", "size": [1, 4, 512, 64], "is_leaf": true, "stride": [131072, 32768, 64, 1], "storage": 1, "view_func": "", "describer_id": 362}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0} +V1003 10:11:03.509000 2235078 torch/_subclasses/meta_utils.py:1640] {"describe_source": {"describer_id": 362, "id": 1, "source": "L['k']"}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0} +V1003 10:11:03.510000 2235078 torch/_subclasses/meta_utils.py:204] {"describe_storage": {"id": 2, "describer_id": 362, "size": 524288}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0} +V1003 10:11:03.510000 2235078 torch/_subclasses/meta_utils.py:417] {"describe_tensor": {"id": 2, "ndim": 4, "dtype": "torch.float32", "device": "device(type='cuda', index=0)", "size": [1, 4, 512, 64], "is_leaf": true, "stride": [131072, 32768, 64, 1], "storage": 2, "view_func": "", "describer_id": 362}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0} +V1003 10:11:03.511000 2235078 torch/_subclasses/meta_utils.py:1640] {"describe_source": {"describer_id": 362, "id": 2, "source": "L['v']"}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0} +V1003 10:11:03.520000 2235078 torch/_subclasses/meta_utils.py:204] {"describe_storage": {"id": 3, "describer_id": 362, "size": 64}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0} +V1003 10:11:03.520000 2235078 torch/_subclasses/meta_utils.py:417] {"describe_tensor": {"id": 3, "ndim": 3, "dtype": "torch.int32", "device": "device(type='cuda', index=0)", "size": [1, 1, 16], "is_leaf": true, "stride": [16, 16, 1], "storage": 3, "view_func": "", "describer_id": 362}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0} +V1003 10:11:03.521000 2235078 torch/_subclasses/meta_utils.py:1640] {"describe_source": {"describer_id": 362, "id": 3, "source": "L['block_mask'].kv_num_blocks"}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0} +V1003 10:11:03.653000 2235078 torch/_subclasses/meta_utils.py:204] {"describe_storage": {"id": 4, "describer_id": 362, "size": 1024}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0} +V1003 10:11:03.654000 2235078 torch/_subclasses/meta_utils.py:417] {"describe_tensor": {"id": 5, "ndim": 4, "dtype": "torch.int32", "device": "device(type='cuda', index=0)", "size": [1, 1, 16, 16], "is_leaf": true, "stride": [256, 256, 16, 1], "storage": 4, "view_func": "", "describer_id": 362}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0} +V1003 10:11:03.654000 2235078 torch/_subclasses/meta_utils.py:1640] {"describe_source": {"describer_id": 362, "id": 5, "source": "L['block_mask'].kv_indices"}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0} +V1003 10:11:03.655000 2235078 torch/_subclasses/meta_utils.py:204] {"describe_storage": {"id": 5, "describer_id": 362, "size": 64}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0} +V1003 10:11:03.656000 2235078 torch/_subclasses/meta_utils.py:417] {"describe_tensor": {"id": 6, "ndim": 3, "dtype": "torch.int32", "device": "device(type='cuda', index=0)", "size": [1, 1, 16], "is_leaf": true, "stride": [16, 16, 1], "storage": 5, "view_func": "", "describer_id": 362}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0} +V1003 10:11:03.656000 2235078 torch/_subclasses/meta_utils.py:1640] {"describe_source": {"describer_id": 362, "id": 6, "source": "L['block_mask'].full_kv_num_blocks"}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0} +V1003 10:11:03.657000 2235078 torch/_subclasses/meta_utils.py:204] {"describe_storage": {"id": 6, "describer_id": 362, "size": 1024}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0} +V1003 10:11:03.658000 2235078 torch/_subclasses/meta_utils.py:417] {"describe_tensor": {"id": 7, "ndim": 4, "dtype": "torch.int32", "device": "device(type='cuda', index=0)", "size": [1, 1, 16, 16], "is_leaf": true, "stride": [256, 256, 16, 1], "storage": 6, "view_func": "", "describer_id": 362}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0} +V1003 10:11:03.658000 2235078 torch/_subclasses/meta_utils.py:1640] {"describe_source": {"describer_id": 362, "id": 7, "source": "L['block_mask'].full_kv_indices"}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0} +V1003 10:11:03.659000 2235078 torch/_subclasses/meta_utils.py:204] {"describe_storage": {"id": 7, "describer_id": 362, "size": 64}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0} +V1003 10:11:03.659000 2235078 torch/_subclasses/meta_utils.py:417] {"describe_tensor": {"id": 8, "ndim": 3, "dtype": "torch.int32", "device": "device(type='cuda', index=0)", "size": [1, 1, 16], "is_leaf": true, "stride": [16, 16, 1], "storage": 7, "view_func": "", "describer_id": 362}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0} +V1003 10:11:03.660000 2235078 torch/_subclasses/meta_utils.py:1640] {"describe_source": {"describer_id": 362, "id": 8, "source": "L['block_mask'].q_num_blocks"}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0} +V1003 10:11:03.661000 2235078 torch/_subclasses/meta_utils.py:204] {"describe_storage": {"id": 8, "describer_id": 362, "size": 1024}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0} +V1003 10:11:03.661000 2235078 torch/_subclasses/meta_utils.py:417] {"describe_tensor": {"id": 9, "ndim": 4, "dtype": "torch.int32", "device": "device(type='cuda', index=0)", "size": [1, 1, 16, 16], "is_leaf": true, "stride": [256, 256, 16, 1], "storage": 8, "view_func": "", "describer_id": 362}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0} +V1003 10:11:03.661000 2235078 torch/_subclasses/meta_utils.py:1640] {"describe_source": {"describer_id": 362, "id": 9, "source": "L['block_mask'].q_indices"}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0} +V1003 10:11:03.663000 2235078 torch/_subclasses/meta_utils.py:204] {"describe_storage": {"id": 9, "describer_id": 362, "size": 64}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0} +V1003 10:11:03.663000 2235078 torch/_subclasses/meta_utils.py:417] {"describe_tensor": {"id": 10, "ndim": 3, "dtype": "torch.int32", "device": "device(type='cuda', index=0)", "size": [1, 1, 16], "is_leaf": true, "stride": [16, 16, 1], "storage": 9, "view_func": "", "describer_id": 362}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0} +V1003 10:11:03.663000 2235078 torch/_subclasses/meta_utils.py:1640] {"describe_source": {"describer_id": 362, "id": 10, "source": "L['block_mask'].full_q_num_blocks"}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0} +V1003 10:11:03.665000 2235078 torch/_subclasses/meta_utils.py:204] {"describe_storage": {"id": 10, "describer_id": 362, "size": 1024}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0} +V1003 10:11:03.665000 2235078 torch/_subclasses/meta_utils.py:417] {"describe_tensor": {"id": 11, "ndim": 4, "dtype": "torch.int32", "device": "device(type='cuda', index=0)", "size": [1, 1, 16, 16], "is_leaf": true, "stride": [256, 256, 16, 1], "storage": 10, "view_func": "", "describer_id": 362}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0} +V1003 10:11:03.665000 2235078 torch/_subclasses/meta_utils.py:1640] {"describe_source": {"describer_id": 362, "id": 11, "source": "L['block_mask'].full_q_indices"}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0} +V1003 10:11:03.673000 2235078 torch/_dynamo/output_graph.py:1347] {"dynamo_output_graph": {"sizes": {"l_q_": [1, 4, 512, 64], "l_k_": [1, 4, 512, 64], "l_v_": [1, 4, 512, 64], "l_block_mask_kv_num_blocks": [1, 1, 16], "l_block_mask_kv_indices": [1, 1, 16, 16], "l_block_mask_full_kv_num_blocks": [1, 1, 16], "l_block_mask_full_kv_indices": [1, 1, 16, 16], "l_block_mask_q_num_blocks": [1, 1, 16], "l_block_mask_q_indices": [1, 1, 16, 16], "l_block_mask_full_q_num_blocks": [1, 1, 16], "l_block_mask_full_q_indices": [1, 1, 16, 16], "child_1": [], "child_2": [], "child_3": [], "child_4": [], "child": [], "child_5": [], "child_6": [], "child_7": [], "child_8": [], "out": [1, 4, 512, 64]}}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0, "has_payload": "487c69baa1e2981125efdf989466d11f"} + class GraphModule(torch.nn.Module): + def forward(self, L_q_: "f32[1, 4, 512, 64][131072, 32768, 64, 1]cuda:0", L_k_: "f32[1, 4, 512, 64][131072, 32768, 64, 1]cuda:0", L_v_: "f32[1, 4, 512, 64][131072, 32768, 64, 1]cuda:0", L_block_mask_kv_num_blocks: "i32[1, 1, 16][16, 16, 1]cuda:0", L_block_mask_kv_indices: "i32[1, 1, 16, 16][256, 256, 16, 1]cuda:0", L_block_mask_full_kv_num_blocks: "i32[1, 1, 16][16, 16, 1]cuda:0", L_block_mask_full_kv_indices: "i32[1, 1, 16, 16][256, 256, 16, 1]cuda:0", L_block_mask_q_num_blocks: "i32[1, 1, 16][16, 16, 1]cuda:0", L_block_mask_q_indices: "i32[1, 1, 16, 16][256, 256, 16, 1]cuda:0", L_block_mask_full_q_num_blocks: "i32[1, 1, 16][16, 16, 1]cuda:0", L_block_mask_full_q_indices: "i32[1, 1, 16, 16][256, 256, 16, 1]cuda:0"): + l_q_ = L_q_ + l_k_ = L_k_ + l_v_ = L_v_ + l_block_mask_kv_num_blocks = L_block_mask_kv_num_blocks + l_block_mask_kv_indices = L_block_mask_kv_indices + l_block_mask_full_kv_num_blocks = L_block_mask_full_kv_num_blocks + l_block_mask_full_kv_indices = L_block_mask_full_kv_indices + l_block_mask_q_num_blocks = L_block_mask_q_num_blocks + l_block_mask_q_indices = L_block_mask_q_indices + l_block_mask_full_q_num_blocks = L_block_mask_full_q_num_blocks + l_block_mask_full_q_indices = L_block_mask_full_q_indices + + # File: /data/users/oulgen/pytorch/torch/nn/attention/flex_attention.py:1032 in flex_attention, code: out, lse = flex_attention_hop( + child_1: "i32[][]cuda:0" = l_q_.new_empty([], dtype = torch.int32); child_1 = None + child_2: "i32[][]cuda:0" = l_q_.new_empty([], dtype = torch.int32); child_2 = None + child_3: "i32[][]cuda:0" = l_q_.new_empty([], dtype = torch.int32); child_3 = None + child_4: "i32[][]cuda:0" = l_q_.new_empty([], dtype = torch.int32); child_4 = None + child: "f32[][]cuda:0" = l_q_.new_empty([], requires_grad = False); child = None + score_mod_0 = self.score_mod_0 + child_5: "i32[][]cuda:0" = l_q_.new_empty([], dtype = torch.int32); child_5 = None + child_6: "i32[][]cuda:0" = l_q_.new_empty([], dtype = torch.int32); child_6 = None + child_7: "i32[][]cuda:0" = l_q_.new_empty([], dtype = torch.int32); child_7 = None + child_8: "i32[][]cuda:0" = l_q_.new_empty([], dtype = torch.int32); child_8 = None + mask_fn_0 = self.mask_fn_0 + flex_attention = torch.ops.higher_order.flex_attention(l_q_, l_k_, l_v_, score_mod_0, (l_block_mask_kv_num_blocks, l_block_mask_kv_indices, l_block_mask_full_kv_num_blocks, l_block_mask_full_kv_indices, l_block_mask_q_num_blocks, l_block_mask_q_indices, l_block_mask_full_q_num_blocks, l_block_mask_full_q_indices, 128, 128, mask_fn_0), 0.125, {'ROWS_GUARANTEED_SAFE': False, 'PRESCALE_QK': False, 'OUTPUT_LOGSUMEXP': False}, (), ()); l_q_ = l_k_ = l_v_ = score_mod_0 = l_block_mask_kv_num_blocks = l_block_mask_kv_indices = l_block_mask_full_kv_num_blocks = l_block_mask_full_kv_indices = l_block_mask_q_num_blocks = l_block_mask_q_indices = l_block_mask_full_q_num_blocks = l_block_mask_full_q_indices = mask_fn_0 = None + out: "f32[1, 4, 512, 64][131072, 32768, 64, 1]cuda:0" = flex_attention[0]; flex_attention = None + return (out,) + + class score_mod_0(torch.nn.Module): + def forward(self, child: "f32[][]cuda:0", child_1: "i32[][]cuda:0", child_2: "i32[][]cuda:0", child_3: "i32[][]cuda:0", child_4: "i32[][]cuda:0"): + return child + + class mask_fn_0(torch.nn.Module): + def forward(self, child_5: "i32[][]cuda:0", child_6: "i32[][]cuda:0", child_7: "i32[][]cuda:0", child_8: "i32[][]cuda:0"): + # File: /data/users/oulgen/pytorch/test/inductor/test_codecache.py:373 in , code: lambda b, h, q, kv: q >= kv, None, None, 2048, 2048 + ge: "b8[][]cuda:0" = child_7 >= child_8; child_7 = child_8 = None + return ge + +V1003 10:11:03.674000 2235078 torch/_dynamo/utils.py:988] {"chromium_event": {}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0, "has_payload": "94724b556ec4c03944e43538685317ee"} + { + "name": "OutputGraph.call_user_compiler", + "ts": 1727975463674357.2, + "args": null, + "ph": "B", + "cat": "dynamo_timed", + "tid": 0, + "pid": 0 + } +V1003 10:11:03.674000 2235078 torch/_dynamo/utils.py:988] {"chromium_event": {}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0, "has_payload": "726b102bddec88824ed82f86c8c02d39"} + { + "name": "backend_compile", + "ts": 1727975463674357.2, + "args": null, + "ph": "B", + "cat": "dynamo_timed", + "tid": 0, + "pid": 0 + } +V1003 10:11:03.680000 2235078 torch/_dynamo/utils.py:988] {"chromium_event": {}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0, "has_payload": "c103e7f8badb6398fb5a927c8aeef126"} + { + "name": "create_aot_dispatcher_function", + "ts": 1727975463680196.2, + "args": null, + "ph": "B", + "cat": "dynamo_timed", + "tid": 0, + "pid": 0 + } +V1003 10:11:03.798000 2235078 torch/_functorch/_aot_autograd/dispatch_and_compile_graph.py:215] {"aot_forward_graph": {}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0, "has_payload": "fe8d53ae08cd360e1378a8a527d186f0"} + class (torch.nn.Module): + def forward(self, arg0_1: "f32[1, 4, 512, 64][131072, 32768, 64, 1]cuda:0", arg1_1: "f32[1, 4, 512, 64][131072, 32768, 64, 1]cuda:0", arg2_1: "f32[1, 4, 512, 64][131072, 32768, 64, 1]cuda:0", arg3_1: "i32[1, 1, 16][16, 16, 1]cuda:0", arg4_1: "i32[1, 1, 16, 16][256, 256, 16, 1]cuda:0", arg5_1: "i32[1, 1, 16][16, 16, 1]cuda:0", arg6_1: "i32[1, 1, 16, 16][256, 256, 16, 1]cuda:0", arg7_1: "i32[1, 1, 16][16, 16, 1]cuda:0", arg8_1: "i32[1, 1, 16, 16][256, 256, 16, 1]cuda:0", arg9_1: "i32[1, 1, 16][16, 16, 1]cuda:0", arg10_1: "i32[1, 1, 16, 16][256, 256, 16, 1]cuda:0"): + # File: /data/users/oulgen/pytorch/torch/nn/attention/flex_attention.py:1032 in flex_attention, code: out, lse = flex_attention_hop( + sdpa_score0 = self.sdpa_score0 + sdpa_mask0 = self.sdpa_mask0 + flex_attention = torch.ops.higher_order.flex_attention(arg0_1, arg1_1, arg2_1, sdpa_score0, (arg3_1, arg4_1, arg5_1, arg6_1, arg7_1, arg8_1, arg9_1, arg10_1, 128, 128, sdpa_mask0), 0.125, {'ROWS_GUARANTEED_SAFE': False, 'PRESCALE_QK': False, 'OUTPUT_LOGSUMEXP': False}, (), ()); arg0_1 = arg1_1 = arg2_1 = sdpa_score0 = arg3_1 = arg4_1 = arg5_1 = arg6_1 = arg7_1 = arg8_1 = arg9_1 = arg10_1 = sdpa_mask0 = None + getitem: "f32[1, 4, 512, 64][131072, 32768, 64, 1]cuda:0" = flex_attention[0]; flex_attention = None + return (getitem,) + + class sdpa_score0(torch.nn.Module): + def forward(self, arg0_1: "f32[][]cuda:0", arg1_1: "i32[][]cuda:0", arg2_1: "i32[][]cuda:0", arg3_1: "i32[][]cuda:0", arg4_1: "i32[][]cuda:0"): + return arg0_1 + + class sdpa_mask0(torch.nn.Module): + def forward(self, arg0_1: "i32[][]cuda:0", arg1_1: "i32[][]cuda:0", arg2_1: "i32[][]cuda:0", arg3_1: "i32[][]cuda:0"): + # File: /data/users/oulgen/pytorch/test/inductor/test_codecache.py:373 in , code: lambda b, h, q, kv: q >= kv, None, None, 2048, 2048 + ge: "b8[][]cuda:0" = torch.ops.aten.ge.Tensor(arg2_1, arg3_1); arg2_1 = arg3_1 = None + return ge + +V1003 10:11:03.799000 2235078 torch/_dynamo/utils.py:988] {"chromium_event": {}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0, "has_payload": "19af874521da2f24c9f873fb757d7dc6"} + { + "name": "compile_fx..fw_compiler_base", + "ts": 1727975463799121.2, + "args": null, + "ph": "B", + "cat": "dynamo_timed", + "tid": 0, + "pid": 0 + } +V1003 10:11:03.801000 2235078 torch/_dynamo/utils.py:988] {"chromium_event": {}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0, "has_payload": "bb6048e2282795589bd2400ba093dfc8"} + { + "name": "compile_fx_inner", + "ts": 1727975463801024.5, + "args": null, + "ph": "B", + "cat": "dynamo_timed", + "tid": 0, + "pid": 0 + } +V1003 10:11:03.801000 2235078 torch/_dynamo/utils.py:988] {"chromium_event": {}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0, "has_payload": "b7b2a66f7ec1d6b6227990c8cd3b1d21"} + { + "name": "inductor_compile", + "ts": 1727975463801024.5, + "args": null, + "ph": "B", + "cat": "dynamo_timed", + "tid": 0, + "pid": 0 + } +V1003 10:11:03.816000 2235078 torch/_inductor/compile_fx.py:731] {"artifact": {"name": "fx_graph_runnable", "encoding": "string"}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0, "has_payload": "c507028eda974d39e4ca6593c6320460"} + + import torch + from torch import tensor, device + import torch.fx as fx + from torch._dynamo.testing import rand_strided + from math import inf + import torch._inductor.inductor_prims + + import torch._dynamo.config + import torch._inductor.config + import torch._functorch.config + import torch.fx.experimental._config + torch._dynamo.config.log_compilation_metrics = False + torch._dynamo.config.fake_tensor_cache_crosscheck_enabled = True + torch._inductor.config.fx_graph_remote_cache = False + torch._inductor.config.autotune_local_cache = False + torch._inductor.config.autotune_remote_cache = False + torch._functorch.config.unlift_effect_tokens = True + + + + isolate_fails_code_str = None + + + + # torch version: 2.5.0a0+git7647c39 + # torch cuda version: 12.0 + # torch git version: 7647c398ff87daf70260854cf0a7f7993b3abc76 + + + # CUDA Info: + # nvcc: NVIDIA (R) Cuda compiler driver + # Copyright (c) 2005-2023 NVIDIA Corporation + # Built on Fri_Jan__6_16:45:21_PST_2023 + # Cuda compilation tools, release 12.0, V12.0.140 + # Build cuda_12.0.r12.0/compiler.32267302_0 + + # GPU Hardware Info: + # NVIDIA PG509-210 : 8 + + + from torch.nn import * + class Repro(torch.nn.Module): + def __init__(self) -> None: + super().__init__() + self.sdpa_score0 = () + self.sdpa_mask0 = () + + + + def forward(self, arg0_1, arg1_1, arg2_1, arg3_1, arg4_1, arg5_1, arg6_1, arg7_1, arg8_1, arg9_1, arg10_1): + sdpa_score0 = self.sdpa_score0 + sdpa_mask0 = self.sdpa_mask0 + flex_attention = torch.ops.higher_order.flex_attention(arg0_1, arg1_1, arg2_1, sdpa_score0, (arg3_1, arg4_1, arg5_1, arg6_1, arg7_1, arg8_1, arg9_1, arg10_1, 128, 128, sdpa_mask0), 0.125, {'ROWS_GUARANTEED_SAFE': False, 'PRESCALE_QK': False, 'OUTPUT_LOGSUMEXP': False}, (), ()); arg0_1 = arg1_1 = arg2_1 = sdpa_score0 = arg3_1 = arg4_1 = arg5_1 = arg6_1 = arg7_1 = arg8_1 = arg9_1 = arg10_1 = sdpa_mask0 = None + getitem = flex_attention[0]; flex_attention = None + return (getitem,) + + def load_args(reader): + buf0 = reader.storage(None, 524288, device=device(type='cuda', index=0)) + reader.tensor(buf0, (1, 4, 512, 64), is_leaf=True) # arg0_1 + buf1 = reader.storage(None, 524288, device=device(type='cuda', index=0)) + reader.tensor(buf1, (1, 4, 512, 64), is_leaf=True) # arg1_1 + buf2 = reader.storage(None, 524288, device=device(type='cuda', index=0)) + reader.tensor(buf2, (1, 4, 512, 64), is_leaf=True) # arg2_1 + buf3 = reader.storage(None, 64, device=device(type='cuda', index=0), dtype_hint=torch.int32) + reader.tensor(buf3, (1, 1, 16), dtype=torch.int32, is_leaf=True) # arg3_1 + buf4 = reader.storage(None, 1024, device=device(type='cuda', index=0), dtype_hint=torch.int32) + reader.tensor(buf4, (1, 1, 16, 16), dtype=torch.int32, is_leaf=True) # arg4_1 + buf5 = reader.storage(None, 64, device=device(type='cuda', index=0), dtype_hint=torch.int32) + reader.tensor(buf5, (1, 1, 16), dtype=torch.int32, is_leaf=True) # arg5_1 + buf6 = reader.storage(None, 1024, device=device(type='cuda', index=0), dtype_hint=torch.int32) + reader.tensor(buf6, (1, 1, 16, 16), dtype=torch.int32, is_leaf=True) # arg6_1 + buf7 = reader.storage(None, 64, device=device(type='cuda', index=0), dtype_hint=torch.int32) + reader.tensor(buf7, (1, 1, 16), dtype=torch.int32, is_leaf=True) # arg7_1 + buf8 = reader.storage(None, 1024, device=device(type='cuda', index=0), dtype_hint=torch.int32) + reader.tensor(buf8, (1, 1, 16, 16), dtype=torch.int32, is_leaf=True) # arg8_1 + buf9 = reader.storage(None, 64, device=device(type='cuda', index=0), dtype_hint=torch.int32) + reader.tensor(buf9, (1, 1, 16), dtype=torch.int32, is_leaf=True) # arg9_1 + buf10 = reader.storage(None, 1024, device=device(type='cuda', index=0), dtype_hint=torch.int32) + reader.tensor(buf10, (1, 1, 16, 16), dtype=torch.int32, is_leaf=True) # arg10_1 + load_args._version = 0 + mod = Repro() + if __name__ == '__main__': + from torch._dynamo.repro.after_aot import run_repro + with torch.no_grad(): + run_repro(mod, load_args, accuracy=False, command='run', save_dir=None, tracing_mode='real', check_str=None) + # To run it separately, do + # mod, args = run_repro(mod, load_args, accuracy=False, command='get_args', save_dir=None, tracing_mode='real', check_str=None) + # mod(*args) +V1003 10:11:03.827000 2235078 torch/_inductor/compile_fx.py:795] {"inductor_post_grad_graph": {}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0, "has_payload": "fe8d53ae08cd360e1378a8a527d186f0"} + class (torch.nn.Module): + def forward(self, arg0_1: "f32[1, 4, 512, 64][131072, 32768, 64, 1]cuda:0", arg1_1: "f32[1, 4, 512, 64][131072, 32768, 64, 1]cuda:0", arg2_1: "f32[1, 4, 512, 64][131072, 32768, 64, 1]cuda:0", arg3_1: "i32[1, 1, 16][16, 16, 1]cuda:0", arg4_1: "i32[1, 1, 16, 16][256, 256, 16, 1]cuda:0", arg5_1: "i32[1, 1, 16][16, 16, 1]cuda:0", arg6_1: "i32[1, 1, 16, 16][256, 256, 16, 1]cuda:0", arg7_1: "i32[1, 1, 16][16, 16, 1]cuda:0", arg8_1: "i32[1, 1, 16, 16][256, 256, 16, 1]cuda:0", arg9_1: "i32[1, 1, 16][16, 16, 1]cuda:0", arg10_1: "i32[1, 1, 16, 16][256, 256, 16, 1]cuda:0"): + # File: /data/users/oulgen/pytorch/torch/nn/attention/flex_attention.py:1032 in flex_attention, code: out, lse = flex_attention_hop( + sdpa_score0 = self.sdpa_score0 + sdpa_mask0 = self.sdpa_mask0 + flex_attention = torch.ops.higher_order.flex_attention(arg0_1, arg1_1, arg2_1, sdpa_score0, (arg3_1, arg4_1, arg5_1, arg6_1, arg7_1, arg8_1, arg9_1, arg10_1, 128, 128, sdpa_mask0), 0.125, {'ROWS_GUARANTEED_SAFE': False, 'PRESCALE_QK': False, 'OUTPUT_LOGSUMEXP': False}, (), ()); arg0_1 = arg1_1 = arg2_1 = sdpa_score0 = arg3_1 = arg4_1 = arg5_1 = arg6_1 = arg7_1 = arg8_1 = arg9_1 = arg10_1 = sdpa_mask0 = None + getitem: "f32[1, 4, 512, 64][131072, 32768, 64, 1]cuda:0" = flex_attention[0]; flex_attention = None + return (getitem,) + + class sdpa_score0(torch.nn.Module): + def forward(self, arg0_1: "f32[][]cuda:0", arg1_1: "i32[][]cuda:0", arg2_1: "i32[][]cuda:0", arg3_1: "i32[][]cuda:0", arg4_1: "i32[][]cuda:0"): + return arg0_1 + + class sdpa_mask0(torch.nn.Module): + def forward(self, arg0_1: "i32[][]cuda:0", arg1_1: "i32[][]cuda:0", arg2_1: "i32[][]cuda:0", arg3_1: "i32[][]cuda:0"): + # File: /data/users/oulgen/pytorch/test/inductor/test_codecache.py:373 in , code: lambda b, h, q, kv: q >= kv, None, None, 2048, 2048 + ge: "b8[][]cuda:0" = torch.ops.aten.ge.Tensor(arg2_1, arg3_1); arg2_1 = arg3_1 = None + return ge + +V1003 10:11:03.828000 2235078 torch/_dynamo/utils.py:988] {"chromium_event": {}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0, "has_payload": "e24475885265d7add113de685b5b4785"} + { + "name": "GraphLowering.run", + "ts": 1727975463828251.2, + "args": null, + "ph": "B", + "cat": "dynamo_timed", + "tid": 0, + "pid": 0 + } +V1003 10:11:03.863000 2235078 torch/_dynamo/utils.py:988] {"chromium_event": {}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0, "has_payload": "df613d910e7ef57b6d235421c34afbec"} + { + "name": "GraphLowering.run", + "ts": 1727975463863685.0, + "args": { + "cache_stats": { + "fxgraph_cache_hit": 1, + "fxgraph_cache_miss": 2, + "fxgraph_cache_bypass": 0 + } + }, + "ph": "E", + "cat": "dynamo_timed", + "tid": 0, + "pid": 0 + } +V1003 10:11:03.864000 2235078 torch/_dynamo/utils.py:988] {"chromium_event": {}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0, "has_payload": "5fd8784b67f1838294fd144c5df7385a"} + { + "name": "GraphLowering.compile_to_module", + "ts": 1727975463864454.0, + "args": null, + "ph": "B", + "cat": "dynamo_timed", + "tid": 0, + "pid": 0 + } +V1003 10:11:03.864000 2235078 torch/_dynamo/utils.py:988] {"chromium_event": {}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0, "has_payload": "2614bec07c124d967f10a9f150623f5a"} + { + "name": "code_gen", + "ts": 1727975463864454.0, + "args": null, + "ph": "B", + "cat": "dynamo_timed", + "tid": 0, + "pid": 0 + } +V1003 10:11:03.868000 2235078 torch/_dynamo/utils.py:988] {"chromium_event": {}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0, "has_payload": "c941c5142cd262302a4ea27ba7d7eda9"} + { + "name": "Scheduler.__init__", + "ts": 1727975463868177.8, + "args": null, + "ph": "B", + "cat": "dynamo_timed", + "tid": 0, + "pid": 0 + } +V1003 10:11:03.871000 2235078 torch/_dynamo/utils.py:988] {"chromium_event": {}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0, "has_payload": "19fd40416e912c56022b05ddd45c6f91"} + { + "name": "Scheduler.__init__", + "ts": 1727975463871282.5, + "args": { + "cache_stats": { + "fxgraph_cache_hit": 1, + "fxgraph_cache_miss": 2, + "fxgraph_cache_bypass": 0 + } + }, + "ph": "E", + "cat": "dynamo_timed", + "tid": 0, + "pid": 0 + } +V1003 10:11:03.871000 2235078 torch/_dynamo/utils.py:988] {"chromium_event": {}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0, "has_payload": "1ae33988f4240ac670d91f8fe8bc3613"} + { + "name": "Scheduler.codegen", + "ts": 1727975463871646.2, + "args": null, + "ph": "B", + "cat": "dynamo_timed", + "tid": 0, + "pid": 0 + } +V1003 10:11:03.881000 2235078 torch/_dynamo/utils.py:988] {"chromium_event": {}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0, "has_payload": "4ea7f96a10ffbc186f91b8fbf392a78d"} + { + "name": "Scheduler.codegen", + "ts": 1727975463881456.5, + "args": { + "cache_stats": { + "fxgraph_cache_hit": 1, + "fxgraph_cache_miss": 2, + "fxgraph_cache_bypass": 0 + } + }, + "ph": "E", + "cat": "dynamo_timed", + "tid": 0, + "pid": 0 + } +V1003 10:11:03.881000 2235078 torch/_dynamo/utils.py:988] {"chromium_event": {}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0, "has_payload": "72cc59f88d66ead0086c546b18c184d7"} + { + "name": "PythonWrapperCodegen.generate", + "ts": 1727975463881839.8, + "args": null, + "ph": "B", + "cat": "dynamo_timed", + "tid": 0, + "pid": 0 + } +V1003 10:11:03.884000 2235078 torch/_dynamo/utils.py:988] {"chromium_event": {}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0, "has_payload": "20792f3cea34e70b42f7c6d998aaa598"} + { + "name": "PythonWrapperCodegen.generate", + "ts": 1727975463884612.0, + "args": { + "cache_stats": { + "fxgraph_cache_hit": 1, + "fxgraph_cache_miss": 2, + "fxgraph_cache_bypass": 0 + } + }, + "ph": "E", + "cat": "dynamo_timed", + "tid": 0, + "pid": 0 + } +V1003 10:11:03.885000 2235078 torch/_inductor/graph.py:1899] {"inductor_output_code": {"filename": "/tmp/oulgen/tmp4z1i5ywe/si/csitvhfwicmtxv44ng5kkavccd5rcpalvwhjof7rk2hputpzelxm.py"}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0, "has_payload": "c453185bb704fad4bd52bdb872abe96c"} + # AOT ID: ['2_inference'] + from ctypes import c_void_p, c_long, c_int + import torch + import math + import random + import os + import tempfile + from math import inf, nan + from torch._inductor.hooks import run_intermediate_hooks + from torch._inductor.utils import maybe_profile + from torch._inductor.codegen.memory_planning import _align as align + from torch import device, empty_strided + from torch._inductor.async_compile import AsyncCompile + from torch._inductor.select_algorithm import extern_kernels + from torch._inductor.codegen.multi_kernel import MultiKernelCall + import torch._inductor.kernel.flex_attention + from torch._C import _cuda_getCurrentRawStream as get_raw_stream + import triton + import triton.language as tl + from torch._inductor.runtime.triton_heuristics import grid, split_scan_grid, grid_combo_kernels, start_graph, end_graph + + aten = torch.ops.aten + inductor_ops = torch.ops.inductor + _quantized = torch.ops._quantized + assert_size_stride = torch._C._dynamo.guards.assert_size_stride + empty_strided_cpu = torch._C._dynamo.guards._empty_strided_cpu + empty_strided_cuda = torch._C._dynamo.guards._empty_strided_cuda + empty_strided_xpu = torch._C._dynamo.guards._empty_strided_xpu + reinterpret_tensor = torch._C._dynamo.guards._reinterpret_tensor + alloc_from_pool = torch.ops.inductor._alloc_from_pool + async_compile = AsyncCompile() + + + # kernel path: /tmp/oulgen/tmp4z1i5ywe/vr/cvrnbiynkuy34l7cucuaotjk5vdulwxhwmvmwz7gy5423p3sv57h.py + # Topologically Sorted Source Nodes: [flex_attention], Original ATen: [] + # Source node to ATen node mapping: + # flex_attention => flex_attention + # Graph fragment: + # %flex_attention : [num_users=1] = call_function[target=torch.ops.higher_order.flex_attention](args = (%arg0_1, %arg1_1, %arg2_1, %sdpa_score0, (%arg3_1, %arg4_1, %arg5_1, %arg6_1, %arg7_1, %arg8_1, %arg9_1, %arg10_1, 128, 128, %sdpa_mask0), 0.125, {ROWS_GUARANTEED_SAFE: False, PRESCALE_QK: False, OUTPUT_LOGSUMEXP: False}, (), ()), kwargs = {}) + triton_tem_fused_0 = async_compile.triton('triton_tem_fused_0', ''' + import triton + import triton.language as tl + from triton.compiler.compiler import AttrsDescriptor + + from torch._inductor.runtime import triton_helpers, triton_heuristics + from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math + from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, instance_descriptor, DeviceProperties + + @triton_heuristics.template( + num_stages=3, + num_warps=4, + triton_meta={'signature': {'arg_Q': '*fp32', 'arg_K': '*fp32', 'arg_V': '*fp32', 'arg_LSE': '*fp32', 'arg_KV_NUM_BLKS': '*i32', 'arg_KV_IDX': '*i32', 'arg_FULL_KV_NUM_BLKS': '*i32', 'arg_FULL_KV_IDX': '*i32', 'out_ptr0': '*fp32'}, 'device': DeviceProperties(type='cuda', index=0, cc=80, major=8, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, multi_processor_count=108, warp_size=32), 'constants': {}, 'configs': [AttrsDescriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8), equal_to_1=())]}, + inductor_meta={'kernel_name': 'triton_tem_fused_0', 'backend_hash': 'FB2CA426CF35F271C56C0D69873498391AC248E25890F2B631CA8B52D56952BD', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': False, 'autotune_pointwise': True, 'autotune_remote_cache': False, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False}, + ) + @triton.jit + def triton_tem_fused_0(arg_Q, arg_K, arg_V, arg_LSE, arg_KV_NUM_BLKS, arg_KV_IDX, arg_FULL_KV_NUM_BLKS, arg_FULL_KV_IDX, out_ptr0): + ROWS_GUARANTEED_SAFE : tl.constexpr = False + PRESCALE_QK : tl.constexpr = False + OUTPUT_LOGSUMEXP : tl.constexpr = False + FLOAT32_PRECISION : tl.constexpr = 'ieee' + IS_DIVISIBLE : tl.constexpr = True + SM_SCALE : tl.constexpr = 0.125 + GQA_SHARED_HEADS : tl.constexpr = 1 + HAS_FULL_BLOCKS : tl.constexpr = True + QK_HEAD_DIM : tl.constexpr = 64 + V_HEAD_DIM : tl.constexpr = 64 + BLOCK_M : tl.constexpr = 128 + BLOCK_N : tl.constexpr = 32 + SPARSE_Q_BLOCK_SIZE : tl.constexpr = 128 + SPARSE_KV_BLOCK_SIZE : tl.constexpr = 128 + Q = arg_Q + K = arg_K + V = arg_V + LSE = arg_LSE + KV_NUM_BLKS = arg_KV_NUM_BLKS + KV_IDX = arg_KV_IDX + FULL_KV_NUM_BLKS = arg_FULL_KV_NUM_BLKS + FULL_KV_IDX = arg_FULL_KV_IDX + + # Sub notation for this kernel: + # + # Q: Query, K: Key, V: Value + # M: Number of queries, N: Number of keys/values, D: Model dimension + # QK_HEAD_DIM: The dimension of the query and key embeddings + # V_HEAD_DIM: The dimension of the value embeddings + # z: Batch size, h: Number of heads, m: Number of queries per head, k: Number of keys per head + # GQA_SHARED_HEADS: number of query heads sharing one kv head in GQA setups. + # + # The following FULL_* and PARTIAL_* is defined in the block sparse mask grid, rather than the thread block grid. + # KV_NUM_BLKS: The number of KV blocks (that may or may not require masking) for each query. + # KV_IDX: The indices of KV blocks (that may or may not require masking) for each query. + # FULL_KV_NUM_BLKS: The number of fully unmasked KV blocks (so we don't need masking) for each query. + # FULL_KV_IDX: The indices of fully unmasked KV blocks (so we don't need masking) for each query. + # + # OUTPUT_LOGSUMEXP: We only need to store the logsumexp if we require grad + # + # (Modifiable) Performance tuning options + # BLOCK_M: The thread block size across the seqlen dim of Q. + # BLOCK_N: Iterate over BLOCK_N across the seqlen dim of K/V in each thread block. + + # The below are kernel options that can be applied for certain score_mods, + # or involve a numerics vs. perf tradeoff + # PRESCALE_QK: Whether to pre-scale QK by 1/sqrt(d) and change of base. Has + # about 20% more numerical error, but slightly faster. + # ROWS_GUARANTEED_SAFE: Is it guaranteed that at least one value in each row + # is not masked out? If so, we can skip an extra safety check + + tl.static_assert(SPARSE_Q_BLOCK_SIZE >= BLOCK_M and SPARSE_Q_BLOCK_SIZE % BLOCK_M == 0) + tl.static_assert(SPARSE_KV_BLOCK_SIZE >= BLOCK_N and SPARSE_KV_BLOCK_SIZE % BLOCK_N == 0) + + # Define strides of inputs + stride_qz, stride_qh, stride_qm, stride_qk = 131072, 32768, 64, 1 + stride_kz, stride_kh, stride_kn, stride_kk = 131072, 32768, 64, 1 + stride_vz, stride_vh, stride_vn, stride_vk = 131072, 32768, 64, 1 + + ZQ = 1 + HQ = 4 + Q_LEN = 512 + ZKV = 1 + KV_LEN = 512 + + MATMUL_PRECISION = Q.dtype.element_ty + + q_start = tl.program_id(0) + off_zq = tl.program_id(1) // HQ + off_hq = tl.program_id(1) % HQ + + # We support two cases for batch dimension. a) (ZKV == ZQ) where off_zkv = off_zq. + # b) (ZKV == 1 and ZQ > 1) where KV is broadcasted along the batch dimension and off_zkv=0. + off_zkv = off_zq % ZKV + off_hkv = off_hq // GQA_SHARED_HEADS + off_g = off_hq % GQA_SHARED_HEADS + + q_offset = off_zq * stride_qz + off_hq * stride_qh + k_offset = off_zkv * stride_kz + off_hkv * stride_kh + v_offset = off_zkv * stride_vz + off_hkv * stride_vh + + Q = Q + q_offset + K = K + k_offset + V = V + v_offset + + SPARSE_Z = 1 + SPARSE_HQ = 1 + + sparse_idx_z = off_zq % SPARSE_Z + sparse_idx_hq = off_hq % SPARSE_HQ + + SPARSE_Q_MULTIPLE: tl.constexpr = (SPARSE_Q_BLOCK_SIZE // BLOCK_M) + SPARSE_KV_MULTIPLE: tl.constexpr = (SPARSE_KV_BLOCK_SIZE // BLOCK_N) + + stride_kv_num_blks_h = 16 + stride_kv_idx_h = 256 + stride_kv_idx_m = 16 + + # initialize pointer to m and l + m_i = tl.zeros([BLOCK_M], dtype=tl.float32) - float("inf") + l_i = tl.zeros([BLOCK_M], dtype=tl.float32) + acc = tl.zeros([BLOCK_M, V_HEAD_DIM], dtype=tl.float32) + + offs_m = q_start * BLOCK_M + tl.arange(0, BLOCK_M) + + # KV_IDX and KV_NUM_BLKS are always contiguous. + sparse_hz_offset = sparse_idx_z * SPARSE_HQ + sparse_idx_hq + sparse_kv_num_blks_offset = sparse_hz_offset * stride_kv_num_blks_h + q_start // SPARSE_Q_MULTIPLE + sparse_kv_idx_offset = sparse_hz_offset * stride_kv_idx_h + (q_start // SPARSE_Q_MULTIPLE) * stride_kv_idx_m # noqa: B950 + + Q_block_ptr = tl.make_block_ptr( + base=Q, + shape=(Q_LEN, QK_HEAD_DIM), + strides=(stride_qm, stride_qk), + offsets=(q_start * BLOCK_M, 0), + block_shape=(BLOCK_M, QK_HEAD_DIM), + order=(1, 0) + ) + + # load q: it stays in SRAM throughout the inner loop. + if IS_DIVISIBLE: + q = tl.load(Q_block_ptr) + else: + # boundary check is not free, so we only do it when necessary. + q = tl.load(Q_block_ptr, boundary_check=(0,), padding_option = "zero") + + # ~~~~~~~~~~~~~~ normal blocks ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + # We don't know anything "special" about these blocks, so we need to apply + # both score_mod and mask_mod to it + kv_indices = KV_IDX + sparse_kv_idx_offset + kv_start = tl.load(kv_indices) * SPARSE_KV_BLOCK_SIZE # first kv block we're loading + kv_num_blocks = tl.load(KV_NUM_BLKS + sparse_kv_num_blks_offset) + block_n_end = tl.minimum(kv_num_blocks * SPARSE_KV_MULTIPLE, tl.maximum(tl.cdiv(KV_LEN, BLOCK_N), 1)) + + K_block_ptr = tl.make_block_ptr( + base=K, + shape=(QK_HEAD_DIM, KV_LEN), + strides=(stride_kk, stride_kn), + offsets=(0, kv_start), + block_shape=(QK_HEAD_DIM, BLOCK_N), + order=(0, 1) + ) + V_block_ptr = tl.make_block_ptr( + base=V, + shape=(KV_LEN, V_HEAD_DIM), + strides=(stride_vn, stride_vk), + offsets=(kv_start, 0), + block_shape=(BLOCK_N, V_HEAD_DIM), + order=(1, 0) + ) + offs_n = kv_start + tl.arange(0, BLOCK_N) + + acc, l_i, m_i = forward_inner( + arg_Q, arg_K, arg_V, arg_LSE, arg_KV_NUM_BLKS, arg_KV_IDX, arg_FULL_KV_NUM_BLKS, arg_FULL_KV_IDX, out_ptr0, + q, K_block_ptr, V_block_ptr, Q_LEN, KV_LEN, + acc, l_i, m_i, + off_zq, off_hq, offs_m[:, None], offs_n[None, :], + kv_indices, kv_num_blocks, + 0, block_n_end, + MATMUL_PRECISION, + IS_FULL_BLOCKS=False, + ) + + # ~~~~~~~~~~~~~~ "full" blocks ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + # We know these blocks are guaranteed to be "full", so we don't need to + # apply mask_mod to them - only score_mod + if HAS_FULL_BLOCKS: + # FULL_KV_IDX and FULL_KV_NUM_BLKS are always contiguous. + kv_indices = FULL_KV_IDX + sparse_kv_idx_offset + kv_start = tl.load(kv_indices) * SPARSE_KV_BLOCK_SIZE # first kv block we're loading + kv_num_blocks = tl.load(FULL_KV_NUM_BLKS + sparse_kv_num_blks_offset) + block_n_end = tl.minimum(kv_num_blocks * SPARSE_KV_MULTIPLE, tl.maximum(tl.cdiv(KV_LEN, BLOCK_N), 1)) + + K_block_ptr = tl.make_block_ptr( + base=K, + shape=(QK_HEAD_DIM, KV_LEN), + strides=(stride_kk, stride_kn), + offsets=(0, kv_start), + block_shape=(QK_HEAD_DIM, BLOCK_N), + order=(0, 1) + ) + V_block_ptr = tl.make_block_ptr( + base=V, + shape=(KV_LEN, V_HEAD_DIM), + strides=(stride_vn, stride_vk), + offsets=(kv_start, 0), + block_shape=(BLOCK_N, V_HEAD_DIM), + order=(1, 0) + ) + offs_n = kv_start + tl.arange(0, BLOCK_N) + + acc, l_i, m_i = forward_inner( + arg_Q, arg_K, arg_V, arg_LSE, arg_KV_NUM_BLKS, arg_KV_IDX, arg_FULL_KV_NUM_BLKS, arg_FULL_KV_IDX, out_ptr0, + q, K_block_ptr, V_block_ptr, Q_LEN, KV_LEN, + acc, l_i, m_i, + off_zq, off_hq, offs_m[:, None], offs_n[None, :], + kv_indices, kv_num_blocks, + 0, block_n_end, + MATMUL_PRECISION, + IS_FULL_BLOCKS=True, + ) + + + # [Note] Handle fully masked out rows: + # Li will be the sum(e^(-inf)) == 0.0 for masked out rows, mi will be -inf. + # We set Li to 1.0 which will result in lse/out = 0.0 | after the log(li) + mi(0.0) step + l_i = tl.where(l_i == 0.0, 1, l_i) + + acc = acc / l_i[:, None] + idx_zq = tl.program_id(1) // HQ + idx_hq = tl.program_id(1) % HQ + idx_m = offs_m[:, None] + idx_d = tl.arange(0, V_HEAD_DIM)[None, :] + + mask = idx_m < Q_LEN + # TODO generalize and add proper mask support + xindex = idx_d + (64*idx_m) + (32768*idx_hq) + (131072*idx_zq) + tl.store(out_ptr0 + (tl.broadcast_to(idx_d + (64*idx_m) + (32768*idx_hq), acc.shape)), acc, mask) + + # TODO dont want to write this if we dont require grad + if OUTPUT_LOGSUMEXP: + off_hz = tl.program_id(1) + l_ptrs = LSE + off_hz * Q_LEN + offs_m + lse = m_i + tl.math.log2(l_i) + if IS_DIVISIBLE: + tl.store(l_ptrs, lse) + else: + tl.store(l_ptrs, lse, mask=offs_m < Q_LEN) + + @triton.jit + def forward_inner( + arg_Q, arg_K, arg_V, arg_LSE, arg_KV_NUM_BLKS, arg_KV_IDX, arg_FULL_KV_NUM_BLKS, arg_FULL_KV_IDX, out_ptr0, + q, K_block_ptr, V_block_ptr, Q_LEN, KV_LEN, + # accumulated values + acc, l_i, m_i, + # Offsets used as inputs to score_mod & mask_mod + # of size [BLOCK_M, BLOCK_N] or scalar. + off_z, off_h, offs_m, offs_n, + # blocksparse data + kv_indices, kv_num_blocks, + # start kv and end kv block + block_n_start, block_n_end, + MATMUL_PRECISION, + IS_FULL_BLOCKS, + ): + # Redefines all kernel parameters (BLOCK_M, etc.) so we don't need to plumb them all through + ROWS_GUARANTEED_SAFE : tl.constexpr = False + PRESCALE_QK : tl.constexpr = False + OUTPUT_LOGSUMEXP : tl.constexpr = False + FLOAT32_PRECISION : tl.constexpr = 'ieee' + IS_DIVISIBLE : tl.constexpr = True + SM_SCALE : tl.constexpr = 0.125 + GQA_SHARED_HEADS : tl.constexpr = 1 + HAS_FULL_BLOCKS : tl.constexpr = True + QK_HEAD_DIM : tl.constexpr = 64 + V_HEAD_DIM : tl.constexpr = 64 + BLOCK_M : tl.constexpr = 128 + BLOCK_N : tl.constexpr = 32 + SPARSE_Q_BLOCK_SIZE : tl.constexpr = 128 + SPARSE_KV_BLOCK_SIZE : tl.constexpr = 128 + + + SPARSE_KV_MULTIPLE: tl.constexpr = (SPARSE_KV_BLOCK_SIZE // BLOCK_N) + RCP_LN2: tl.constexpr = 1.44269504 + + if PRESCALE_QK: + q = (q * SM_SCALE * RCP_LN2).to(MATMUL_PRECISION) + + # loop over k, v and update accumulator until block_n_end + for start_n in range(block_n_start, block_n_end): + if IS_DIVISIBLE: + acc, l_i, m_i = forward_block_mn( + arg_Q, arg_K, arg_V, arg_LSE, arg_KV_NUM_BLKS, arg_KV_IDX, arg_FULL_KV_NUM_BLKS, arg_FULL_KV_IDX, out_ptr0, + q, K_block_ptr, V_block_ptr, Q_LEN, KV_LEN, + # accumulated values + acc, l_i, m_i, + # Offsets + off_z, off_h, offs_m, offs_n, + MATMUL_PRECISION, RCP_LN2, + IS_FULL_BLOCKS, + ) + else: + # Benchmark shows even we applied mod & mask to each block for non divisible seqlen, + # it's on par or slightly faster than only applying to the last block in fwd. + # However, we choose different strategy for bwd, where we only apply mod & mask + # to the last block because it's faster a lot. + acc, l_i, m_i = forward_block_mn( + arg_Q, arg_K, arg_V, arg_LSE, arg_KV_NUM_BLKS, arg_KV_IDX, arg_FULL_KV_NUM_BLKS, arg_FULL_KV_IDX, out_ptr0, + q, K_block_ptr, V_block_ptr, Q_LEN, KV_LEN, + # accumulated values + acc, l_i, m_i, + # Offsets + off_z, off_h, offs_m, offs_n, + MATMUL_PRECISION, RCP_LN2, + IS_FULL_BLOCKS, CHECK_BLOCK_BOUNDARY=True, + ) + + # update pointers + offset = get_offset_for_next_block( + start_n, kv_indices, kv_num_blocks, + SPARSE_KV_BLOCK_SIZE, SPARSE_KV_MULTIPLE, BLOCK_N + ) + + V_block_ptr = tl.advance(V_block_ptr, (offset, 0)) + K_block_ptr = tl.advance(K_block_ptr, (0, offset)) + + offs_n = offs_n + offset + + return acc, l_i, m_i + + + @triton.jit + def get_offset_for_next_block(loop_iter, col_indices, total_blocks, SPARSE_BLOCK, SPARSE_BLOCK_MULTIPLE, BLOCK): + cur_block_idx = loop_iter // SPARSE_BLOCK_MULTIPLE + cur_block = tl.load(col_indices + cur_block_idx, eviction_policy="evict_last") + next_block = tl.load(col_indices + cur_block_idx + 1, eviction_policy="evict_last", mask=cur_block_idx + 1 < total_blocks) + needs_jump = (loop_iter + 1) % SPARSE_BLOCK_MULTIPLE == 0 + jump_to_block = (next_block - cur_block ) * SPARSE_BLOCK - (SPARSE_BLOCK_MULTIPLE - 1) * BLOCK + + offset = jump_to_block * needs_jump + (1 - needs_jump) * BLOCK + return offset + + @triton.jit + def forward_block_mn( + arg_Q, arg_K, arg_V, arg_LSE, arg_KV_NUM_BLKS, arg_KV_IDX, arg_FULL_KV_NUM_BLKS, arg_FULL_KV_IDX, out_ptr0, + q, K_block_ptr, V_block_ptr, Q_LEN, KV_LEN, + # accumulated values + acc, l_i, m_i, + # Offsets + off_z, off_h, offs_m, offs_n, + MATMUL_PRECISION, RCP_LN2, + IS_FULL_BLOCKS, CHECK_BLOCK_BOUNDARY=False, + ): + # Redefines all kernel parameters (BLOCK_M, etc.) so we don't need to plumb them all through + ROWS_GUARANTEED_SAFE : tl.constexpr = False + PRESCALE_QK : tl.constexpr = False + OUTPUT_LOGSUMEXP : tl.constexpr = False + FLOAT32_PRECISION : tl.constexpr = 'ieee' + IS_DIVISIBLE : tl.constexpr = True + SM_SCALE : tl.constexpr = 0.125 + GQA_SHARED_HEADS : tl.constexpr = 1 + HAS_FULL_BLOCKS : tl.constexpr = True + QK_HEAD_DIM : tl.constexpr = 64 + V_HEAD_DIM : tl.constexpr = 64 + BLOCK_M : tl.constexpr = 128 + BLOCK_N : tl.constexpr = 32 + SPARSE_Q_BLOCK_SIZE : tl.constexpr = 128 + SPARSE_KV_BLOCK_SIZE : tl.constexpr = 128 + + + # -- load k -- + if IS_DIVISIBLE: + k = tl.load(K_block_ptr) + else: + k = tl.load(K_block_ptr, boundary_check=(1,), padding_option = "zero") + # -- compute qk --- + qk = tl.dot(q, k, input_precision=FLOAT32_PRECISION) # TODO: use cuda matmul when q_len <= 2. + if not PRESCALE_QK: + qk *= SM_SCALE + # ~~~~~~~~~~~~~~~~~~~ Apply score modification ~~~~~~~~~~~~~~~~~~~ + if CHECK_BLOCK_BOUNDARY: + # If this is the last block of a non divisible seqlen, we still need to load [BLOCK_M, BLOCK_N] elements, + # which is larger than the actual number of elements. To avoid access memory out of bound, + # we need to mask out the elements that are out of Q_LEN & KV_LEN. + m = offs_m % Q_LEN + n = offs_n % KV_LEN + else: + m = offs_m + n = offs_n + + post_mod_scores = (qk) + + + if CHECK_BLOCK_BOUNDARY: + # Mask out the elements that are out of the KV_LEN for non divisible seqlen. + post_mod_scores = tl.where(offs_n < KV_LEN, post_mod_scores, float("-inf")) + + if not IS_FULL_BLOCKS: + tmp0 = (m) >= (n) + mask_mod_output = tmp0 + + + if CHECK_BLOCK_BOUNDARY: + mask_mod_output = tl.where(offs_n < KV_LEN, mask_mod_output, float("-inf")) + # apply mask for partially unmasked blocks + post_mod_scores = tl.where(mask_mod_output, post_mod_scores, float("-inf")) + + # TODO: In the case that score_mod is linear, this can be LICMed + if not PRESCALE_QK: + post_mod_scores *= RCP_LN2 + # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + # -- compute scaling constant --- + m_ij = tl.maximum(m_i, tl.max(post_mod_scores, 1)) + if not ROWS_GUARANTEED_SAFE: + masked_out_rows = (m_ij == float("-inf")) + m_ij_masked = tl.where(masked_out_rows, 0, m_ij) + else: + m_ij_masked = m_ij + + alpha = tl.math.exp2(m_i - m_ij_masked) + p = tl.math.exp2(post_mod_scores - m_ij_masked[:, None]) + + # NB: l_i update is pulled up here since it's a bit faster + # NB: For headdim=256, it's faster to move it back down to after m_i = + # m_ij + l_i = l_i * alpha + tl.sum(p, 1) + # # -- scale and update acc -- + acc = acc * alpha[:, None] + + if IS_DIVISIBLE: + v = tl.load(V_block_ptr) + else: + v = tl.load(V_block_ptr, boundary_check=(0,), padding_option = "zero") + acc = tl.dot(p.to(MATMUL_PRECISION), v, acc, input_precision=FLOAT32_PRECISION) + + # -- update m_i + m_i = m_ij + + return acc, l_i, m_i + ''', device_str='cuda') + meta0 = {'ROWS_GUARANTEED_SAFE': False, 'PRESCALE_QK': False, 'OUTPUT_LOGSUMEXP': False, 'FLOAT32_PRECISION': "'ieee'", 'IS_DIVISIBLE': True, 'SM_SCALE': 0.125, 'GQA_SHARED_HEADS': 1, 'HAS_FULL_BLOCKS': True, 'QK_HEAD_DIM': 64, 'V_HEAD_DIM': 64, 'BLOCK_M': 128, 'BLOCK_N': 32, 'SPARSE_Q_BLOCK_SIZE': 128, 'SPARSE_KV_BLOCK_SIZE': 128} + + + async_compile.wait(globals()) + del async_compile + + def call(args): + arg0_1, arg1_1, arg2_1, arg3_1, arg4_1, arg5_1, arg6_1, arg7_1, arg8_1, arg9_1, arg10_1 = args + args.clear() + assert_size_stride(arg0_1, (1, 4, 512, 64), (131072, 32768, 64, 1)) + assert_size_stride(arg1_1, (1, 4, 512, 64), (131072, 32768, 64, 1)) + assert_size_stride(arg2_1, (1, 4, 512, 64), (131072, 32768, 64, 1)) + assert_size_stride(arg3_1, (1, 1, 16), (16, 16, 1)) + assert_size_stride(arg4_1, (1, 1, 16, 16), (256, 256, 16, 1)) + assert_size_stride(arg5_1, (1, 1, 16), (16, 16, 1)) + assert_size_stride(arg6_1, (1, 1, 16, 16), (256, 256, 16, 1)) + assert_size_stride(arg7_1, (1, 1, 16), (16, 16, 1)) + assert_size_stride(arg8_1, (1, 1, 16, 16), (256, 256, 16, 1)) + assert_size_stride(arg9_1, (1, 1, 16), (16, 16, 1)) + assert_size_stride(arg10_1, (1, 1, 16, 16), (256, 256, 16, 1)) + buf0 = empty_strided_cuda((1, 4, 512), (2048, 512, 1), torch.float32) + with torch.cuda._DeviceGuard(0): + torch.cuda.set_device(0) + buf1 = empty_strided_cuda((1, 4, 512, 64), (131072, 32768, 64, 1), torch.float32) + # Topologically Sorted Source Nodes: [flex_attention], Original ATen: [] + stream0 = get_raw_stream(0) + triton_tem_fused_0.run(arg0_1, arg1_1, arg2_1, buf0, arg3_1, arg4_1, arg5_1, arg6_1, buf1, grid=torch._inductor.kernel.flex_attention.flex_attention_grid(1, 4, 512, 64, meta0), stream=stream0) + del arg0_1 + del arg1_1 + del arg2_1 + del arg3_1 + del arg4_1 + del arg5_1 + del arg6_1 + del buf0 + return (buf1, ) + + + def benchmark_compiled_module(times=10, repeat=10): + from torch._dynamo.testing import rand_strided + from torch._inductor.utils import print_performance + arg0_1 = rand_strided((1, 4, 512, 64), (131072, 32768, 64, 1), device='cuda:0', dtype=torch.float32) + arg1_1 = rand_strided((1, 4, 512, 64), (131072, 32768, 64, 1), device='cuda:0', dtype=torch.float32) + arg2_1 = rand_strided((1, 4, 512, 64), (131072, 32768, 64, 1), device='cuda:0', dtype=torch.float32) + arg3_1 = rand_strided((1, 1, 16), (16, 16, 1), device='cuda:0', dtype=torch.int32) + arg4_1 = rand_strided((1, 1, 16, 16), (256, 256, 16, 1), device='cuda:0', dtype=torch.int32) + arg5_1 = rand_strided((1, 1, 16), (16, 16, 1), device='cuda:0', dtype=torch.int32) + arg6_1 = rand_strided((1, 1, 16, 16), (256, 256, 16, 1), device='cuda:0', dtype=torch.int32) + arg7_1 = rand_strided((1, 1, 16), (16, 16, 1), device='cuda:0', dtype=torch.int32) + arg8_1 = rand_strided((1, 1, 16, 16), (256, 256, 16, 1), device='cuda:0', dtype=torch.int32) + arg9_1 = rand_strided((1, 1, 16), (16, 16, 1), device='cuda:0', dtype=torch.int32) + arg10_1 = rand_strided((1, 1, 16, 16), (256, 256, 16, 1), device='cuda:0', dtype=torch.int32) + fn = lambda: call([arg0_1, arg1_1, arg2_1, arg3_1, arg4_1, arg5_1, arg6_1, arg7_1, arg8_1, arg9_1, arg10_1]) + return print_performance(fn, times=times, repeat=repeat) + + + if __name__ == "__main__": + from torch._inductor.wrapper_benchmark import compiled_module_main + compiled_module_main('None', benchmark_compiled_module) + +V1003 10:11:10.314000 2235078 torch/_dynamo/utils.py:988] {"chromium_event": {}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0, "has_payload": "0c6c8a1109a650b2f61d8b8a52500c9f"} + { + "name": "code_gen", + "ts": 1727975470313939.2, + "args": { + "cache_stats": { + "fxgraph_cache_hit": 1, + "fxgraph_cache_miss": 2, + "fxgraph_cache_bypass": 0 + } + }, + "ph": "E", + "cat": "dynamo_timed", + "tid": 0, + "pid": 0 + } +V1003 10:11:10.314000 2235078 torch/_dynamo/utils.py:988] {"chromium_event": {}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0, "has_payload": "3d6e3c947033eed0a9cadd2013455900"} + { + "name": "GraphLowering.compile_to_module", + "ts": 1727975470314540.5, + "args": { + "cache_stats": { + "fxgraph_cache_hit": 1, + "fxgraph_cache_miss": 2, + "fxgraph_cache_bypass": 0 + } + }, + "ph": "E", + "cat": "dynamo_timed", + "tid": 0, + "pid": 0 + } +V1003 10:11:10.316000 2235078 torch/_dynamo/utils.py:1020] {"chromium_event": {}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0, "has_payload": "17b87920e3cd4803f6a7235280e07423"} + { + "name": "fx_graph_cache_miss", + "ts": 1727975463815134.0, + "args": { + "key": "flfpwf422lzhb6yszystfjdqdgiiqyds6rdbqc57o3a6n4uehjgp", + "components": [ + "[nhtxa6qinb75ty5rqjx3bdkokm2s3bz3nrhke3zmvm73qypm2md] gm: (\n (sdpa_score0): ()\n (sdpa_mask0): ()\n)\n\n\n\ndef forward(self, arg0_1, arg1_1, arg2_1, arg3_1, arg4_1, arg5_1, arg6_1, arg7_1, arg8_1, arg9_1, arg10_1):\n sdpa_score0 = self.sdpa_score0\n sdpa_mask0 = self.sdpa_mask0\n flex_attention = torch.ops.higher_order.flex_attention(arg0_1, arg1_1, arg2_1, sdpa_score0, (arg3_1, arg4_1, arg5_1, arg6_1, arg7_1, arg8_1, arg9_1, arg10_1, 128, 128, sdpa_mask0), 0.125, {'ROWS_GUARANTEED_SAFE': False, 'PRESCALE_QK': False, 'OUTPUT_LOGSUMEXP': False}, (), ()); arg0_1 = arg1_1 = arg2_1 = sdpa_score0 = arg3_1 = arg4_1 = arg5_1 = arg6_1 = arg7_1 = arg8_1 = arg9_1 = arg10_1 = sdpa_mask0 = None\n getitem = flex_attention[0]; flex_attention = None\n return (getitem,)\n \n# To see more debug info, please use `graph_module.print_readable()`", + "[avf2u3luxvyabchjhbddapcjn5gev47wfdtkrprayuhv6lf2z6u] example_inputs[0]: TensorMetadata(dtype=torch.float32, shape=torch.Size([1, 4, 512, 64]), stride=(131072, 32768, 64, 1), device=device(type='cuda', index=0), layout=torch.strided, memory_format=torch.contiguous_format, storage_offset=0, storage_bytes=None, requires_grad=False, is_quantized=False, is_conj=False, is_neg=False, is_inference=False, is_sparse=False, is_coalesced=None, dense_dim=None, sparse_dim=None)", + "[avf2u3luxvyabchjhbddapcjn5gev47wfdtkrprayuhv6lf2z6u] example_inputs[1]: TensorMetadata(dtype=torch.float32, shape=torch.Size([1, 4, 512, 64]), stride=(131072, 32768, 64, 1), device=device(type='cuda', index=0), layout=torch.strided, memory_format=torch.contiguous_format, storage_offset=0, storage_bytes=None, requires_grad=False, is_quantized=False, is_conj=False, is_neg=False, is_inference=False, is_sparse=False, is_coalesced=None, dense_dim=None, sparse_dim=None)", + "[avf2u3luxvyabchjhbddapcjn5gev47wfdtkrprayuhv6lf2z6u] example_inputs[2]: TensorMetadata(dtype=torch.float32, shape=torch.Size([1, 4, 512, 64]), stride=(131072, 32768, 64, 1), device=device(type='cuda', index=0), layout=torch.strided, memory_format=torch.contiguous_format, storage_offset=0, storage_bytes=None, requires_grad=False, is_quantized=False, is_conj=False, is_neg=False, is_inference=False, is_sparse=False, is_coalesced=None, dense_dim=None, sparse_dim=None)", + "[zsk3gejenkcvvwhiyk36u5zdnlrcs6wgy3pina3csuierfd2zri] example_inputs[3]: TensorMetadata(dtype=torch.int32, shape=torch.Size([1, 1, 16]), stride=(16, 16, 1), device=device(type='cuda', index=0), layout=torch.strided, memory_format=torch.contiguous_format, storage_offset=0, storage_bytes=None, requires_grad=False, is_quantized=False, is_conj=False, is_neg=False, is_inference=False, is_sparse=False, is_coalesced=None, dense_dim=None, sparse_dim=None)", + "[hnbjjzmb63q27mbr22eubaelyb423burv27meouma6ccysmwu6g] example_inputs[4]: TensorMetadata(dtype=torch.int32, shape=torch.Size([1, 1, 16, 16]), stride=(256, 256, 16, 1), device=device(type='cuda', index=0), layout=torch.strided, memory_format=torch.contiguous_format, storage_offset=0, storage_bytes=None, requires_grad=False, is_quantized=False, is_conj=False, is_neg=False, is_inference=False, is_sparse=False, is_coalesced=None, dense_dim=None, sparse_dim=None)", + "[zsk3gejenkcvvwhiyk36u5zdnlrcs6wgy3pina3csuierfd2zri] example_inputs[5]: TensorMetadata(dtype=torch.int32, shape=torch.Size([1, 1, 16]), stride=(16, 16, 1), device=device(type='cuda', index=0), layout=torch.strided, memory_format=torch.contiguous_format, storage_offset=0, storage_bytes=None, requires_grad=False, is_quantized=False, is_conj=False, is_neg=False, is_inference=False, is_sparse=False, is_coalesced=None, dense_dim=None, sparse_dim=None)", + "[hnbjjzmb63q27mbr22eubaelyb423burv27meouma6ccysmwu6g] example_inputs[6]: TensorMetadata(dtype=torch.int32, shape=torch.Size([1, 1, 16, 16]), stride=(256, 256, 16, 1), device=device(type='cuda', index=0), layout=torch.strided, memory_format=torch.contiguous_format, storage_offset=0, storage_bytes=None, requires_grad=False, is_quantized=False, is_conj=False, is_neg=False, is_inference=False, is_sparse=False, is_coalesced=None, dense_dim=None, sparse_dim=None)", + "[zsk3gejenkcvvwhiyk36u5zdnlrcs6wgy3pina3csuierfd2zri] example_inputs[7]: TensorMetadata(dtype=torch.int32, shape=torch.Size([1, 1, 16]), stride=(16, 16, 1), device=device(type='cuda', index=0), layout=torch.strided, memory_format=torch.contiguous_format, storage_offset=0, storage_bytes=None, requires_grad=False, is_quantized=False, is_conj=False, is_neg=False, is_inference=False, is_sparse=False, is_coalesced=None, dense_dim=None, sparse_dim=None)", + "[hnbjjzmb63q27mbr22eubaelyb423burv27meouma6ccysmwu6g] example_inputs[8]: TensorMetadata(dtype=torch.int32, shape=torch.Size([1, 1, 16, 16]), stride=(256, 256, 16, 1), device=device(type='cuda', index=0), layout=torch.strided, memory_format=torch.contiguous_format, storage_offset=0, storage_bytes=None, requires_grad=False, is_quantized=False, is_conj=False, is_neg=False, is_inference=False, is_sparse=False, is_coalesced=None, dense_dim=None, sparse_dim=None)", + "[zsk3gejenkcvvwhiyk36u5zdnlrcs6wgy3pina3csuierfd2zri] example_inputs[9]: TensorMetadata(dtype=torch.int32, shape=torch.Size([1, 1, 16]), stride=(16, 16, 1), device=device(type='cuda', index=0), layout=torch.strided, memory_format=torch.contiguous_format, storage_offset=0, storage_bytes=None, requires_grad=False, is_quantized=False, is_conj=False, is_neg=False, is_inference=False, is_sparse=False, is_coalesced=None, dense_dim=None, sparse_dim=None)", + "[hnbjjzmb63q27mbr22eubaelyb423burv27meouma6ccysmwu6g] example_inputs[10]: TensorMetadata(dtype=torch.int32, shape=torch.Size([1, 1, 16, 16]), stride=(256, 256, 16, 1), device=device(type='cuda', index=0), layout=torch.strided, memory_format=torch.contiguous_format, storage_offset=0, storage_bytes=None, requires_grad=False, is_quantized=False, is_conj=False, is_neg=False, is_inference=False, is_sparse=False, is_coalesced=None, dense_dim=None, sparse_dim=None)", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] fx_kwargs[aot_mode]: False", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] fx_kwargs[cpp_wrapper]: False", + "[xq2hdkbfkbcuye6rgtypayrkhqf4cntij2dsd24rei3lsknakkf] fx_kwargs[cudagraphs]: BoxedBool(value=False)", + "[tquy2we2efmowuj4wuqzcfcfdcrkzkzmwdae6hprj7fa64jpusq] fx_kwargs[extern_node_serializer]: None", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] fx_kwargs[is_backward]: False", + "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] fx_kwargs[is_inference]: True", + "[tquy2we2efmowuj4wuqzcfcfdcrkzkzmwdae6hprj7fa64jpusq] fx_kwargs[layout_opt]: None", + "[h25wqx6vliw4j5rtzzbv6latydxyei3deyg6v7wzvnzryfktuki] fx_kwargs[static_input_idxs]: []", + "[f44ag5aflby2bkxl7a4k6whljrk7jat7bmreuxklei4p3czhk7p] fx_kwargs[user_visible_outputs]: {'getitem': None}", + "[vrl5ktomgtzox5xucd3np6vug3vyj6hwwzahqijuwpmamlv7ohi] inputs_to_check[0]: 0", + "[aghvyrrgwvxijco2pk5wzc3cgmmthrbmgxitiibxuuscxdwrjd3] inputs_to_check[1]: 1", + "[pr5nr4a7dthirgd2ljo3d2xakc63ywxugusu6mkmr6gmpeliyib] inputs_to_check[2]: 2", + "[kcuxe2zwm3mzv2uk6adm6iskoy35bqfv725twacrdewod2dbl5d] inputs_to_check[3]: 3", + "[lkkae3meylaixfif4thncru4hjqeaislawjoghffrbwuscaagei] inputs_to_check[4]: 4", + "[qs5hilycp4ew4ivtc7m5jaxp7q4pm5slioxw3fi3ur6ei65ybz4] inputs_to_check[5]: 5", + "[agkvbkaha53nbz3aeeuhvxjvvc4glhfjofzkg6g2qjoo2e5otcx] inputs_to_check[6]: 6", + "[j3s5elu6itwgjafc7rzhy4whrbufl6kfmlufjhh25grt643bk5f] inputs_to_check[7]: 7", + "[yttmfmxblgcbsvbokguzowcorrcxz5uunxtcvsbe6nijgcx45he] inputs_to_check[8]: 8", + "[qlgfiyqewrmkgqth2qm6wkq2ja5lzkapg3ypgnvoyfqqnidaoj3] inputs_to_check[9]: 9", + "[j6c55jha5r2sdys2rwq7uqhtleea5dgjcye7nicfgft36v7xfvp] inputs_to_check[10]: 10", + "[du4vyrfyozrfxcf6kk6ma7oqwatapifazeelfsawmsiu6gjdtxp] deterministic_algorithms_settings: (False, False, True)", + "[qiptf2633zubseuei4bkisoq3not35l6lud6p23p4qmcsxiw2uq] cuda_matmul_settings: (False, True, True)", + "[7uhqwjfn75ek3woo3k7em2mluon5hx2ojvzlevlvjvz6xfxjhzl] torch_version: ", + "[c3z7bmoxyo6gl5hi47v6dc7jwsl55b3asd75nr25uyengi5ah3p] system_info[device]: {'name': 'NVIDIA PG509-210'}", + "[3fb7kae6ogkdd4zcm3fkjoipdpybxhn4aoxzv7z7xsfwq233e4l] system_info[version]: {'triton': '3.1.0+5fe38ffd73dc767c8fadcf23ea82d79e257c37d44077eae7f681cf967565fd43e9c017937b-835d4fc33500e1accafc5c5e00f4f73d87432c114860c04b68849bf6f942b8e5-dc767c8fadcf23ea82d79e257c37d44077eae7f681cf967565fd43e9c017937b-23d635e690d670bf61798e1259674b78c0ed5ba222ab6a455f329f27a758fc2d-e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855-20b017e9c4d858ab05e783f77df50b86c6d6eee5d79f3f4b158562b4a54f8443-f44338a31e0534290b08653050804c3fabbde403a6d3004ae04f0c28495f0802-e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855-a979896b9c0acfd41dd953b90bdc4b10968f7c0b45a286eae3f829aaddb2bb55-da771298f7bc45d24a61f35ef51742304421df1ab49d50bf1fc510dd5a46ea4b-a8fb7be728d460b7ec64ab62edb8af1bbca8994fd718cde7178d46bad64530a1-71330f394e584b0df29595d49f6ac8ac0c5503db9147090dc58ad888cebac7be-f24adfd52383f7866791ebaa5d45a5d2cc826e56ee2fd285f438e85d201fe643-a34be0d3ae4b3ac9aede195cfda42f8a0a097b2bc9642fb59673ce6b3b607f10-36130a37af1b19a0dec569aa08d30b00c74c8f02b6b632999d86dea169146792-36d42f0429aae027cb985b53b9abc616fae4dad9e0ea03953e1e9fb46d0fb9a0-e5d2cb724c08d0ef4130f3ba858d22cf21f834bfd970a5388aa6ad2a6bab91f9', 'cuda': '12.0'}", + "[z5x2bdhir5lzlbti73vdbfulnuu5vinzpwgmmgf4rjb775tzl3h] system_info[hash]: 9698c97edde4a99a2f3b54bbd0db5291bbcdb75c83acb376ccff61fb0bf0ac1a", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[TYPE_CHECKING]: False", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[abi_compatible]: False", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[aggressive_fusion]: False", + "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[allow_buffer_reuse]: True", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[allow_stack_allocation]: False", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[always_keep_tensor_constants]: False", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[aot_inductor.debug_compile]: False", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[aot_inductor.debug_dump_consts_bin]: False", + "[ngkkx5e6z7erl6da23zb2cmsctz4yvaqyameyg5hbqln4wrhh7x] inductor_config[aot_inductor.debug_intermediate_value_printer]: 0", + "[tquy2we2efmowuj4wuqzcfcfdcrkzkzmwdae6hprj7fa64jpusq] inductor_config[aot_inductor.filtered_kernel_names]: None", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[aot_inductor.force_mmap_weights]: False", + "[4bryyl4ahh5whyg3zwqebpwmjnx6w77nqgqbdjlowju6lkqtn7w] inductor_config[aot_inductor.metadata]: {}", + "[v3hzzlv4tjgvp3pyhmzagjd25orl6n7nynoa7svlhhwk73b7u3c] inductor_config[aot_inductor.output_path]: ", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[aot_inductor.package]: False", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[aot_inductor.package_cpp_only]: False", + "[v3hzzlv4tjgvp3pyhmzagjd25orl6n7nynoa7svlhhwk73b7u3c] inductor_config[aot_inductor.serialized_in_spec]: ", + "[v3hzzlv4tjgvp3pyhmzagjd25orl6n7nynoa7svlhhwk73b7u3c] inductor_config[aot_inductor.serialized_out_spec]: ", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[aot_inductor.use_runtime_constant_folding]: False", + "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[assert_indirect_indexing]: True", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[assume_aligned_inputs]: False", + "[v3hzzlv4tjgvp3pyhmzagjd25orl6n7nynoa7svlhhwk73b7u3c] inductor_config[autoheuristic_collect]: ", + "[jvchmi66fvqzlemhr5fcqorz5trfdtdalzfagtj2aolmimwqhdq] inductor_config[autoheuristic_log_path]: DEFAULT", + "[jwbrgxes7vjqumngs5hyj6gn5nytv2whnppnzngvaagfmawhkkd] inductor_config[autoheuristic_use]: mixed_mm", + "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[autotune_fallback_to_aten]: True", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[autotune_in_subproc]: False", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[autotune_local_cache]: False", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[autotune_multi_device]: False", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[autotune_remote_cache]: False", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[b2b_gemm_pass]: False", + "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[batch_fusion]: True", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[benchmark_combo_kernel]: False", + "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[benchmark_epilogue_fusion]: True", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[benchmark_fusion]: False", + "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[benchmark_harness]: True", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[benchmark_kernel]: False", + "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[bw_outputs_user_visible]: True", + "[b4ha3ravs3qv237q65hpfqegbnoww7tf2ahcbu2i7xo6te5spqs] inductor_config[c_shim_version]: 2", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[check_stack_no_cycles_TESTING_ONLY]: False", + "[aghvyrrgwvxijco2pk5wzc3cgmmthrbmgxitiibxuuscxdwrjd3] inductor_config[combo_kernel_allow_mixed_sizes]: 1", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[combo_kernel_foreach_dynamic_shapes]: False", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[combo_kernels]: False", + "[aghvyrrgwvxijco2pk5wzc3cgmmthrbmgxitiibxuuscxdwrjd3] inductor_config[combo_kernels_autotune]: 1", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[comment_origin]: False", + "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[comprehensive_padding]: True", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[compute_all_bounds]: False", + "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[constant_and_index_propagation]: True", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[conv_1x1_as_mm]: False", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[coordinate_descent_check_all_directions]: False", + "[aghvyrrgwvxijco2pk5wzc3cgmmthrbmgxitiibxuuscxdwrjd3] inductor_config[coordinate_descent_search_radius]: 1", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[coordinate_descent_tuning]: False", + "[c7zj4qytmety6keurs3hsh5wn7foxp3dqx4kym2ucszzcb2ngrf] inductor_config[cpp.cxx]: (None, 'g++')", + "[yrty22bseefglnysuoec4ji7j2rnaggdj3g33zzj7avogwfmgdw] inductor_config[cpp.descriptive_names]: original_aten", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[cpp.dynamic_threads]: False", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[cpp.enable_floating_point_contract_flag]: False", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[cpp.enable_kernel_profile]: False", + "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[cpp.enable_loop_tail_vec]: True", + "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[cpp.enable_tiling_heuristics]: True", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[cpp.enable_unsafe_math_opt_flag]: False", + "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[cpp.fallback_scatter_reduce_sum]: True", + "[tquy2we2efmowuj4wuqzcfcfdcrkzkzmwdae6hprj7fa64jpusq] inductor_config[cpp.gemm_cache_blocking]: None", + "[aghvyrrgwvxijco2pk5wzc3cgmmthrbmgxitiibxuuscxdwrjd3] inductor_config[cpp.gemm_max_k_slices]: 1", + "[tquy2we2efmowuj4wuqzcfcfdcrkzkzmwdae6hprj7fa64jpusq] inductor_config[cpp.gemm_thread_factors]: None", + "[tquy2we2efmowuj4wuqzcfcfdcrkzkzmwdae6hprj7fa64jpusq] inductor_config[cpp.inject_log1p_bug_TESTING_ONLY]: None", + "[tquy2we2efmowuj4wuqzcfcfdcrkzkzmwdae6hprj7fa64jpusq] inductor_config[cpp.inject_relu_bug_TESTING_ONLY]: None", + "[ebt2ncs4f5y7dn7btzi76mnouepvzad474tmp5iju4wiuumjl4s] inductor_config[cpp.max_horizontal_fusion_size]: 16", + "[g7rrnbg5yonzux3cfj5ovre5lob3ayda7qcfpxjvtwmiz4uicii] inductor_config[cpp.min_chunk_size]: 4096", + "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[cpp.no_redundant_loops]: True", + "[tquy2we2efmowuj4wuqzcfcfdcrkzkzmwdae6hprj7fa64jpusq] inductor_config[cpp.simdlen]: None", + "[sz3im5ogc6asp7g4uqocnovype63tkdexzfrniv6hn2oank3biu] inductor_config[cpp.threads]: -1", + "[tquy2we2efmowuj4wuqzcfcfdcrkzkzmwdae6hprj7fa64jpusq] inductor_config[cpp.vec_isa_ok]: None", + "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[cpp.weight_prepack]: True", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[cpp_wrapper]: False", + "[bsvfcwwoczx2rlkdz2eta6doujsymyihmi46hhwk6clrrvwcb6m] inductor_config[cpu_backend]: cpp", + "[tquy2we2efmowuj4wuqzcfcfdcrkzkzmwdae6hprj7fa64jpusq] inductor_config[cuda.arch]: None", + "[tvyftmtdmezlejo2xllu7awzv4pzc4vm4fub4b3gpl5jptjkosi] inductor_config[cuda.compile_opt_level]: -O1", + "[tquy2we2efmowuj4wuqzcfcfdcrkzkzmwdae6hprj7fa64jpusq] inductor_config[cuda.cuda_cxx]: None", + "[aghvyrrgwvxijco2pk5wzc3cgmmthrbmgxitiibxuuscxdwrjd3] inductor_config[cuda.cutlass_backend_min_gemm_size]: 1", + "[tquy2we2efmowuj4wuqzcfcfdcrkzkzmwdae6hprj7fa64jpusq] inductor_config[cuda.cutlass_max_profiling_configs]: None", + "[tquy2we2efmowuj4wuqzcfcfdcrkzkzmwdae6hprj7fa64jpusq] inductor_config[cuda.cutlass_op_allowlist_regex]: None", + "[lwkz5chtpji756gurqw4foijfi7zfgljtnn5nmnvdi2skpt4mgh] inductor_config[cuda.cutlass_op_denylist_regex]: pingpong", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[cuda.enable_cuda_lto]: False", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[cuda.enable_debug_info]: False", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[cuda.enable_ptxas_info]: False", + "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[cuda.generate_test_runner]: True", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[cuda.use_fast_math]: False", + "[tquy2we2efmowuj4wuqzcfcfdcrkzkzmwdae6hprj7fa64jpusq] inductor_config[cuda.version]: None", + "[caw4ly2z672k6kjfahoxwpajp5idhhtrpgf3ma2clylcp7c7aid] inductor_config[cuda_backend]: triton", + "[pikr7bbcoixfzftsazp5ggufhdklj24babfry77bl4nuvyrrcp4] inductor_config[custom_op_default_layout_constraint]: needs_fixed_stride_order", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[dce]: False", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[debug]: False", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[debug_fusion]: False", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[debug_index_asserts]: False", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[debug_ir_traceback]: False", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[decompose_mem_bound_mm]: False", + "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[developer_warnings]: True", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[disable_cpp_codegen]: False", + "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[disable_padding_cpu]: True", + "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[disable_progress]: True", + "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[dynamic_scale_rblock]: True", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[efficient_conv_bn_eval_fx_passes]: False", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[emulate_precision_casts]: False", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[enable_auto_functionalized_v2]: False", + "[v3hzzlv4tjgvp3pyhmzagjd25orl6n7nynoa7svlhhwk73b7u3c] inductor_config[enabled_metric_tables]: ", + "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[epilogue_fusion]: True", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[epilogue_fusion_first]: False", + "[lxxtoqhcoepwfokeiibd575gnxo3uzwiv4hmpomlwkpzqz3qzsh] inductor_config[estimate_op_runtime]: default", + "[h25wqx6vliw4j5rtzzbv6latydxyei3deyg6v7wzvnzryfktuki] inductor_config[external_matmul]: []", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[fallback_random]: False", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[force_disable_caches]: False", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[force_fuse_int_mm_with_mul]: False", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[force_layout_optimization]: False", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[force_same_precision]: False", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[force_shape_pad]: False", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[freezing]: False", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[freezing_discard_parameters]: False", + "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[fx_graph_cache]: True", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[fx_graph_remote_cache]: False", + "[62lrdx35b7hnumwb7mp5oc5y5csm2abylvtdzfloct3noaqov3n] inductor_config[fx_passes_numeric_check]: {'pre_grad': False, 'post_grad': False, 'precision': 0.0001, 'num_iterations': 1, 'requires_optimizer': True}", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[generate_intermediate_hooks]: False", + "[tquy2we2efmowuj4wuqzcfcfdcrkzkzmwdae6hprj7fa64jpusq] inductor_config[global_cache_dir]: None", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[group_fusion]: False", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[halide.asserts]: False", + "[ljhgflgihidopsfsdcbqynv27nceykby3nutyd5jlcpq7n6e7l4] inductor_config[halide.cpu_target]: host", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[halide.debug]: False", + "[wx7vmsmrdpk5ue2txlywp3lj3faqmdjphs5fgg2ehzsyno7uovg] inductor_config[halide.gpu_target]: host-cuda", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[halide.scan_kernels]: False", + "[k5ogk6345jvklsnu7g2njqstiz2g6pm5wmqpgg3kasrmuqwjvl6] inductor_config[halide.scheduler_cpu]: Adams2019", + "[svgytlua5wcyeia7wq7e6zgh5tsueikrnzchmdmouvmkpfsc2zq] inductor_config[halide.scheduler_cuda]: Anderson2021", + "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[implicit_fallbacks]: True", + "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[inplace_buffers]: True", + "[5fxczt3ciyxitdhizb7sfsgn7fhpczcqsngttnt5ot2wyctk7co] inductor_config[inter_node_bw]: 25", + "[yezuzjtg4h3jjur4jwtwiehbyixa7eonq4tqsqmwqve2lvvmrem] inductor_config[intra_node_bw]: 300", + "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[is_nightly_or_source]: True", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[is_predispatch]: False", + "[tquy2we2efmowuj4wuqzcfcfdcrkzkzmwdae6hprj7fa64jpusq] inductor_config[joint_custom_post_pass]: None", + "[tquy2we2efmowuj4wuqzcfcfdcrkzkzmwdae6hprj7fa64jpusq] inductor_config[joint_custom_pre_pass]: None", + "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[joint_graph_constant_folding]: True", + "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[keep_output_stride]: True", + "[j6c55jha5r2sdys2rwq7uqhtleea5dgjcye7nicfgft36v7xfvp] inductor_config[kernel_name_max_ops]: 10", + "[4p2fdjlvxrcw7c7fvzm5huhtqxnro4kvkx56f7p5zyrxqkwooov] inductor_config[layout_opt_default]: 1", + "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[layout_optimization]: True", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[loop_ordering_after_fusion]: False", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[max_autotune]: False", + "[uqlsbif4zxd75vt522p52txyuguieipi2lwz5g5awt56lccqk7s] inductor_config[max_autotune_conv_backends]: ATEN,TRITON", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[max_autotune_gemm]: False", + "[2y7luesktjrque3nr7qtxnum2mkbeegzdrsvkm3rvdlhqboajhx] inductor_config[max_autotune_gemm_backends]: ATEN,TRITON,CPP", + "[jvchmi66fvqzlemhr5fcqorz5trfdtdalzfagtj2aolmimwqhdq] inductor_config[max_autotune_gemm_search_space]: DEFAULT", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[max_autotune_pointwise]: False", + "[bh33ranllcgilhgmgr3qvygzxjm6isq5iexnfm3zx6fnr2zwlp2] inductor_config[max_autotune_subproc_graceful_timeout_seconds]: 1.0", + "[iglov24t7x5ruci344aer2tm6nqshi4veuw4wxlssxtu46cx76m] inductor_config[max_autotune_subproc_result_timeout_seconds]: 60.0", + "[pwoh5aypf4fxbntdvwt67rppxorqos6xr3w7qzeun6kblbfg2ga] inductor_config[max_autotune_subproc_terminate_timeout_seconds]: 2.0", + "[aghvyrrgwvxijco2pk5wzc3cgmmthrbmgxitiibxuuscxdwrjd3] inductor_config[max_epilogue_benchmarked_choices]: 1", + "[jykiys6ynafs3zdylwa5ggq6j655mxeh42d6mtdi22gffkrmiac] inductor_config[max_fusion_size]: 64", + "[yttmfmxblgcbsvbokguzowcorrcxz5uunxtcvsbe6nijgcx45he] inductor_config[max_pointwise_cat_inputs]: 8", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[memory_planning]: False", + "[x75won4jmsgeb63pcvwr2y4eteyzzdhmf5rv6xhjppie4hx2yu5] inductor_config[memory_pool]: intermediates", + "[v2td5s4lnsvyxvaevy4chx6kc5h3mm2axazbgwimqule5zrzao7] inductor_config[mixed_mm_choice]: heuristic", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[nan_asserts]: False", + "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[optimize_scatter_upon_const_tensor]: True", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[pad_channels_last]: False", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[pad_outputs]: False", + "[ljdqgtysl3vdf7j6attlz5gmjg2ncihnveojfyubosplmkrjgra] inductor_config[padding_alignment_bytes]: 128", + "[dnnw5ks3yxrp7mwvihb2hh4tqx35ye637xt33x64kw4fvz2nyzg] inductor_config[padding_stride_threshold]: 1024", + "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[pattern_matcher]: True", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[permute_fusion]: False", + "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[pick_loop_orders]: True", + "[tquy2we2efmowuj4wuqzcfcfdcrkzkzmwdae6hprj7fa64jpusq] inductor_config[post_grad_custom_post_pass]: None", + "[tquy2we2efmowuj4wuqzcfcfdcrkzkzmwdae6hprj7fa64jpusq] inductor_config[post_grad_custom_pre_pass]: None", + "[4bryyl4ahh5whyg3zwqebpwmjnx6w77nqgqbdjlowju6lkqtn7w] inductor_config[post_grad_fusion_options]: {}", + "[tquy2we2efmowuj4wuqzcfcfdcrkzkzmwdae6hprj7fa64jpusq] inductor_config[pre_grad_custom_pass]: None", + "[4bryyl4ahh5whyg3zwqebpwmjnx6w77nqgqbdjlowju6lkqtn7w] inductor_config[pre_grad_fusion_options]: {}", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[profile_bandwidth]: False", + "[tquy2we2efmowuj4wuqzcfcfdcrkzkzmwdae6hprj7fa64jpusq] inductor_config[profile_bandwidth_output]: None", + "[v3hzzlv4tjgvp3pyhmzagjd25orl6n7nynoa7svlhhwk73b7u3c] inductor_config[profile_bandwidth_regex]: ", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[profile_bandwidth_with_do_bench_using_profiling]: False", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[profiler_mark_wrapper_call]: False", + "[yttmfmxblgcbsvbokguzowcorrcxz5uunxtcvsbe6nijgcx45he] inductor_config[realize_acc_reads_threshold]: 8", + "[rr5m5hsocoyodldz7vcvaizdwvm2rt34evmqdxvng7wz3tufvo6] inductor_config[realize_opcount_threshold]: 30", + "[lkkae3meylaixfif4thncru4hjqeaislawjoghffrbwuscaagei] inductor_config[realize_reads_threshold]: 4", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[reorder_for_compute_comm_overlap]: False", + "[ssupi7bu3rrhdpg2jyegzncu3kg3nnhklyliqvutaxgs7y7k3dx] inductor_config[reorder_for_compute_comm_overlap_passes]: ['reorder_compute_for_overlap', 'sink_waits', 'raise_comms']", + "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[reorder_for_locality]: True", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[reorder_for_peak_memory]: False", + "[h25wqx6vliw4j5rtzzbv6latydxyei3deyg6v7wzvnzryfktuki] inductor_config[rocm.arch]: []", + "[tquy2we2efmowuj4wuqzcfcfdcrkzkzmwdae6hprj7fa64jpusq] inductor_config[rocm.ck_dir]: None", + "[oartxnko2l7d67tzwwm2otcumaut3n4wwcfgz3o377hmcveu5ft] inductor_config[rocm.ck_supported_arch]: ['gfx90a', 'gfx940', 'gfx941', 'gfx942']", + "[klfqjprnpfhcdurgvuikvc4rpd5ynkpk77toousr5h3u5roty6p] inductor_config[rocm.compile_opt_level]: -O2", + "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[rocm.flush_denormals]: True", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[rocm.is_debug]: False", + "[tquy2we2efmowuj4wuqzcfcfdcrkzkzmwdae6hprj7fa64jpusq] inductor_config[rocm.n_max_profiling_configs]: None", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[rocm.print_kernel_resource_usage]: False", + "[tquy2we2efmowuj4wuqzcfcfdcrkzkzmwdae6hprj7fa64jpusq] inductor_config[rocm.rocm_home]: None", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[rocm.save_temps]: False", + "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[rocm.use_fast_math]: True", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[rocm.use_preselected_instances]: False", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[save_args]: False", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[search_autotune_cache]: False", + "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[shape_padding]: True", + "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[size_asserts]: True", + "[tquy2we2efmowuj4wuqzcfcfdcrkzkzmwdae6hprj7fa64jpusq] inductor_config[sleep_sec_TESTING_ONLY]: None", + "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[split_cat_fx_passes]: True", + "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[split_reductions]: True", + "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[static_weight_shapes]: True", + "[tquy2we2efmowuj4wuqzcfcfdcrkzkzmwdae6hprj7fa64jpusq] inductor_config[triton.autotune_at_compile_time]: None", + "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[triton.autotune_cublasLt]: True", + "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[triton.autotune_pointwise]: True", + "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[triton.codegen_upcast_to_fp32]: True", + "[tuax46wac7rfv2trf5gcps6vleo3cq44lbnrdxtprvo3ljjaddj] inductor_config[triton.cudagraph_dynamic_shape_warn_limit]: 50", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[triton.cudagraph_skip_dynamic_graphs]: False", + "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[triton.cudagraph_support_input_mutation]: True", + "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[triton.cudagraph_trees]: True", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[triton.cudagraph_trees_history_recording]: False", + "[ljdqgtysl3vdf7j6attlz5gmjg2ncihnveojfyubosplmkrjgra] inductor_config[triton.cudagraph_unexpected_rerecord_limit]: 128", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[triton.cudagraphs]: False", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[triton.debug_sync_graph]: False", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[triton.debug_sync_kernel]: False", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[triton.dense_indexing]: False", + "[yrty22bseefglnysuoec4ji7j2rnaggdj3g33zzj7avogwfmgdw] inductor_config[triton.descriptive_names]: original_aten", + "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[triton.divisible_by_16]: True", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[triton.fast_path_cudagraph_asserts]: False", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[triton.force_cudagraph_sync]: False", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[triton.force_cudagraphs_warmup]: False", + "[tquy2we2efmowuj4wuqzcfcfdcrkzkzmwdae6hprj7fa64jpusq] inductor_config[triton.inject_relu_bug_TESTING_ONLY]: None", + "[pr5nr4a7dthirgd2ljo3d2xakc63ywxugusu6mkmr6gmpeliyib] inductor_config[triton.max_tiles]: 2", + "[fv6slhtedtydps5s5u2etitscliblzcidyitqf7krsv4e23fzk6] inductor_config[triton.min_split_scan_rblock]: 256", + "[vrl5ktomgtzox5xucd3np6vug3vyj6hwwzahqijuwpmamlv7ohi] inductor_config[triton.multi_kernel]: 0", + "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[triton.persistent_reductions]: True", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[triton.prefer_nd_tiling]: False", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[triton.skip_cudagraph_warmup]: False", + "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[triton.slow_path_cudagraph_asserts]: True", + "[ebt2ncs4f5y7dn7btzi76mnouepvzad474tmp5iju4wiuumjl4s] inductor_config[triton.spill_threshold]: 16", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[triton.store_cubin]: False", + "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[triton.tiling_prevents_pointwise_fusion]: True", + "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[triton.tiling_prevents_reduction_fusion]: True", + "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[triton.unique_kernel_names]: True", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[triton.use_block_ptr]: False", + "[vzzema5ityqj2wepdmkulue7q5pcevdr5h27oxxutf35d4tjume] inductor_config[triton_kernel_default_layout_constraint]: flexible_layout", + "[wft6ljqsfr3x4m7fa5zuyb7cwknky4irrxz4bjr6uzr2yiopxqj] inductor_config[unbacked_symint_fallback]: 8192", + "[yttmfmxblgcbsvbokguzowcorrcxz5uunxtcvsbe6nijgcx45he] inductor_config[unroll_reductions_threshold]: 8", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[unsafe_ignore_unsupported_triton_autotune_args]: False", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[use_minimal_arrayref_interface]: False", + "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[use_mixed_mm]: True", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[verbose_progress]: False", + "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[warn_mix_layout]: False" + ], + "cache_event_time": 1727975463815134085, + "cache_state": "miss", + "time_taken_ns": 6500727742 + }, + "ph": "i", + "cat": "dynamo_timed", + "tid": 0, + "pid": 0, + "s": "p" + } +V1003 10:11:10.317000 2235078 torch/_inductor/codecache.py:1463] {"artifact": {"name": "fx_graph_cache_miss", "encoding": "json"}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0, "has_payload": "2b8d4494cf5a9a8e000f402106cf2ba5"} + {"key": "flfpwf422lzhb6yszystfjdqdgiiqyds6rdbqc57o3a6n4uehjgp", "components": ["[nhtxa6qinb75ty5rqjx3bdkokm2s3bz3nrhke3zmvm73qypm2md] gm: (\n (sdpa_score0): ()\n (sdpa_mask0): ()\n)\n\n\n\ndef forward(self, arg0_1, arg1_1, arg2_1, arg3_1, arg4_1, arg5_1, arg6_1, arg7_1, arg8_1, arg9_1, arg10_1):\n sdpa_score0 = self.sdpa_score0\n sdpa_mask0 = self.sdpa_mask0\n flex_attention = torch.ops.higher_order.flex_attention(arg0_1, arg1_1, arg2_1, sdpa_score0, (arg3_1, arg4_1, arg5_1, arg6_1, arg7_1, arg8_1, arg9_1, arg10_1, 128, 128, sdpa_mask0), 0.125, {'ROWS_GUARANTEED_SAFE': False, 'PRESCALE_QK': False, 'OUTPUT_LOGSUMEXP': False}, (), ()); arg0_1 = arg1_1 = arg2_1 = sdpa_score0 = arg3_1 = arg4_1 = arg5_1 = arg6_1 = arg7_1 = arg8_1 = arg9_1 = arg10_1 = sdpa_mask0 = None\n getitem = flex_attention[0]; flex_attention = None\n return (getitem,)\n \n# To see more debug info, please use `graph_module.print_readable()`", "[avf2u3luxvyabchjhbddapcjn5gev47wfdtkrprayuhv6lf2z6u] example_inputs[0]: TensorMetadata(dtype=torch.float32, shape=torch.Size([1, 4, 512, 64]), stride=(131072, 32768, 64, 1), device=device(type='cuda', index=0), layout=torch.strided, memory_format=torch.contiguous_format, storage_offset=0, storage_bytes=None, requires_grad=False, is_quantized=False, is_conj=False, is_neg=False, is_inference=False, is_sparse=False, is_coalesced=None, dense_dim=None, sparse_dim=None)", "[avf2u3luxvyabchjhbddapcjn5gev47wfdtkrprayuhv6lf2z6u] example_inputs[1]: TensorMetadata(dtype=torch.float32, shape=torch.Size([1, 4, 512, 64]), stride=(131072, 32768, 64, 1), device=device(type='cuda', index=0), layout=torch.strided, memory_format=torch.contiguous_format, storage_offset=0, storage_bytes=None, requires_grad=False, is_quantized=False, is_conj=False, is_neg=False, is_inference=False, is_sparse=False, is_coalesced=None, dense_dim=None, sparse_dim=None)", "[avf2u3luxvyabchjhbddapcjn5gev47wfdtkrprayuhv6lf2z6u] example_inputs[2]: TensorMetadata(dtype=torch.float32, shape=torch.Size([1, 4, 512, 64]), stride=(131072, 32768, 64, 1), device=device(type='cuda', index=0), layout=torch.strided, memory_format=torch.contiguous_format, storage_offset=0, storage_bytes=None, requires_grad=False, is_quantized=False, is_conj=False, is_neg=False, is_inference=False, is_sparse=False, is_coalesced=None, dense_dim=None, sparse_dim=None)", "[zsk3gejenkcvvwhiyk36u5zdnlrcs6wgy3pina3csuierfd2zri] example_inputs[3]: TensorMetadata(dtype=torch.int32, shape=torch.Size([1, 1, 16]), stride=(16, 16, 1), device=device(type='cuda', index=0), layout=torch.strided, memory_format=torch.contiguous_format, storage_offset=0, storage_bytes=None, requires_grad=False, is_quantized=False, is_conj=False, is_neg=False, is_inference=False, is_sparse=False, is_coalesced=None, dense_dim=None, sparse_dim=None)", "[hnbjjzmb63q27mbr22eubaelyb423burv27meouma6ccysmwu6g] example_inputs[4]: TensorMetadata(dtype=torch.int32, shape=torch.Size([1, 1, 16, 16]), stride=(256, 256, 16, 1), device=device(type='cuda', index=0), layout=torch.strided, memory_format=torch.contiguous_format, storage_offset=0, storage_bytes=None, requires_grad=False, is_quantized=False, is_conj=False, is_neg=False, is_inference=False, is_sparse=False, is_coalesced=None, dense_dim=None, sparse_dim=None)", "[zsk3gejenkcvvwhiyk36u5zdnlrcs6wgy3pina3csuierfd2zri] example_inputs[5]: TensorMetadata(dtype=torch.int32, shape=torch.Size([1, 1, 16]), stride=(16, 16, 1), device=device(type='cuda', index=0), layout=torch.strided, memory_format=torch.contiguous_format, storage_offset=0, storage_bytes=None, requires_grad=False, is_quantized=False, is_conj=False, is_neg=False, is_inference=False, is_sparse=False, is_coalesced=None, dense_dim=None, sparse_dim=None)", "[hnbjjzmb63q27mbr22eubaelyb423burv27meouma6ccysmwu6g] example_inputs[6]: TensorMetadata(dtype=torch.int32, shape=torch.Size([1, 1, 16, 16]), stride=(256, 256, 16, 1), device=device(type='cuda', index=0), layout=torch.strided, memory_format=torch.contiguous_format, storage_offset=0, storage_bytes=None, requires_grad=False, is_quantized=False, is_conj=False, is_neg=False, is_inference=False, is_sparse=False, is_coalesced=None, dense_dim=None, sparse_dim=None)", "[zsk3gejenkcvvwhiyk36u5zdnlrcs6wgy3pina3csuierfd2zri] example_inputs[7]: TensorMetadata(dtype=torch.int32, shape=torch.Size([1, 1, 16]), stride=(16, 16, 1), device=device(type='cuda', index=0), layout=torch.strided, memory_format=torch.contiguous_format, storage_offset=0, storage_bytes=None, requires_grad=False, is_quantized=False, is_conj=False, is_neg=False, is_inference=False, is_sparse=False, is_coalesced=None, dense_dim=None, sparse_dim=None)", "[hnbjjzmb63q27mbr22eubaelyb423burv27meouma6ccysmwu6g] example_inputs[8]: TensorMetadata(dtype=torch.int32, shape=torch.Size([1, 1, 16, 16]), stride=(256, 256, 16, 1), device=device(type='cuda', index=0), layout=torch.strided, memory_format=torch.contiguous_format, storage_offset=0, storage_bytes=None, requires_grad=False, is_quantized=False, is_conj=False, is_neg=False, is_inference=False, is_sparse=False, is_coalesced=None, dense_dim=None, sparse_dim=None)", "[zsk3gejenkcvvwhiyk36u5zdnlrcs6wgy3pina3csuierfd2zri] example_inputs[9]: TensorMetadata(dtype=torch.int32, shape=torch.Size([1, 1, 16]), stride=(16, 16, 1), device=device(type='cuda', index=0), layout=torch.strided, memory_format=torch.contiguous_format, storage_offset=0, storage_bytes=None, requires_grad=False, is_quantized=False, is_conj=False, is_neg=False, is_inference=False, is_sparse=False, is_coalesced=None, dense_dim=None, sparse_dim=None)", "[hnbjjzmb63q27mbr22eubaelyb423burv27meouma6ccysmwu6g] example_inputs[10]: TensorMetadata(dtype=torch.int32, shape=torch.Size([1, 1, 16, 16]), stride=(256, 256, 16, 1), device=device(type='cuda', index=0), layout=torch.strided, memory_format=torch.contiguous_format, storage_offset=0, storage_bytes=None, requires_grad=False, is_quantized=False, is_conj=False, is_neg=False, is_inference=False, is_sparse=False, is_coalesced=None, dense_dim=None, sparse_dim=None)", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] fx_kwargs[aot_mode]: False", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] fx_kwargs[cpp_wrapper]: False", "[xq2hdkbfkbcuye6rgtypayrkhqf4cntij2dsd24rei3lsknakkf] fx_kwargs[cudagraphs]: BoxedBool(value=False)", "[tquy2we2efmowuj4wuqzcfcfdcrkzkzmwdae6hprj7fa64jpusq] fx_kwargs[extern_node_serializer]: None", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] fx_kwargs[is_backward]: False", "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] fx_kwargs[is_inference]: True", "[tquy2we2efmowuj4wuqzcfcfdcrkzkzmwdae6hprj7fa64jpusq] fx_kwargs[layout_opt]: None", "[h25wqx6vliw4j5rtzzbv6latydxyei3deyg6v7wzvnzryfktuki] fx_kwargs[static_input_idxs]: []", "[f44ag5aflby2bkxl7a4k6whljrk7jat7bmreuxklei4p3czhk7p] fx_kwargs[user_visible_outputs]: {'getitem': None}", "[vrl5ktomgtzox5xucd3np6vug3vyj6hwwzahqijuwpmamlv7ohi] inputs_to_check[0]: 0", "[aghvyrrgwvxijco2pk5wzc3cgmmthrbmgxitiibxuuscxdwrjd3] inputs_to_check[1]: 1", "[pr5nr4a7dthirgd2ljo3d2xakc63ywxugusu6mkmr6gmpeliyib] inputs_to_check[2]: 2", "[kcuxe2zwm3mzv2uk6adm6iskoy35bqfv725twacrdewod2dbl5d] inputs_to_check[3]: 3", "[lkkae3meylaixfif4thncru4hjqeaislawjoghffrbwuscaagei] inputs_to_check[4]: 4", "[qs5hilycp4ew4ivtc7m5jaxp7q4pm5slioxw3fi3ur6ei65ybz4] inputs_to_check[5]: 5", "[agkvbkaha53nbz3aeeuhvxjvvc4glhfjofzkg6g2qjoo2e5otcx] inputs_to_check[6]: 6", "[j3s5elu6itwgjafc7rzhy4whrbufl6kfmlufjhh25grt643bk5f] inputs_to_check[7]: 7", "[yttmfmxblgcbsvbokguzowcorrcxz5uunxtcvsbe6nijgcx45he] inputs_to_check[8]: 8", "[qlgfiyqewrmkgqth2qm6wkq2ja5lzkapg3ypgnvoyfqqnidaoj3] inputs_to_check[9]: 9", "[j6c55jha5r2sdys2rwq7uqhtleea5dgjcye7nicfgft36v7xfvp] inputs_to_check[10]: 10", "[du4vyrfyozrfxcf6kk6ma7oqwatapifazeelfsawmsiu6gjdtxp] deterministic_algorithms_settings: (False, False, True)", "[qiptf2633zubseuei4bkisoq3not35l6lud6p23p4qmcsxiw2uq] cuda_matmul_settings: (False, True, True)", "[7uhqwjfn75ek3woo3k7em2mluon5hx2ojvzlevlvjvz6xfxjhzl] torch_version: ", "[c3z7bmoxyo6gl5hi47v6dc7jwsl55b3asd75nr25uyengi5ah3p] system_info[device]: {'name': 'NVIDIA PG509-210'}", "[3fb7kae6ogkdd4zcm3fkjoipdpybxhn4aoxzv7z7xsfwq233e4l] system_info[version]: {'triton': '3.1.0+5fe38ffd73dc767c8fadcf23ea82d79e257c37d44077eae7f681cf967565fd43e9c017937b-835d4fc33500e1accafc5c5e00f4f73d87432c114860c04b68849bf6f942b8e5-dc767c8fadcf23ea82d79e257c37d44077eae7f681cf967565fd43e9c017937b-23d635e690d670bf61798e1259674b78c0ed5ba222ab6a455f329f27a758fc2d-e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855-20b017e9c4d858ab05e783f77df50b86c6d6eee5d79f3f4b158562b4a54f8443-f44338a31e0534290b08653050804c3fabbde403a6d3004ae04f0c28495f0802-e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855-a979896b9c0acfd41dd953b90bdc4b10968f7c0b45a286eae3f829aaddb2bb55-da771298f7bc45d24a61f35ef51742304421df1ab49d50bf1fc510dd5a46ea4b-a8fb7be728d460b7ec64ab62edb8af1bbca8994fd718cde7178d46bad64530a1-71330f394e584b0df29595d49f6ac8ac0c5503db9147090dc58ad888cebac7be-f24adfd52383f7866791ebaa5d45a5d2cc826e56ee2fd285f438e85d201fe643-a34be0d3ae4b3ac9aede195cfda42f8a0a097b2bc9642fb59673ce6b3b607f10-36130a37af1b19a0dec569aa08d30b00c74c8f02b6b632999d86dea169146792-36d42f0429aae027cb985b53b9abc616fae4dad9e0ea03953e1e9fb46d0fb9a0-e5d2cb724c08d0ef4130f3ba858d22cf21f834bfd970a5388aa6ad2a6bab91f9', 'cuda': '12.0'}", "[z5x2bdhir5lzlbti73vdbfulnuu5vinzpwgmmgf4rjb775tzl3h] system_info[hash]: 9698c97edde4a99a2f3b54bbd0db5291bbcdb75c83acb376ccff61fb0bf0ac1a", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[TYPE_CHECKING]: False", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[abi_compatible]: False", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[aggressive_fusion]: False", "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[allow_buffer_reuse]: True", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[allow_stack_allocation]: False", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[always_keep_tensor_constants]: False", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[aot_inductor.debug_compile]: False", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[aot_inductor.debug_dump_consts_bin]: False", "[ngkkx5e6z7erl6da23zb2cmsctz4yvaqyameyg5hbqln4wrhh7x] inductor_config[aot_inductor.debug_intermediate_value_printer]: 0", "[tquy2we2efmowuj4wuqzcfcfdcrkzkzmwdae6hprj7fa64jpusq] inductor_config[aot_inductor.filtered_kernel_names]: None", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[aot_inductor.force_mmap_weights]: False", "[4bryyl4ahh5whyg3zwqebpwmjnx6w77nqgqbdjlowju6lkqtn7w] inductor_config[aot_inductor.metadata]: {}", "[v3hzzlv4tjgvp3pyhmzagjd25orl6n7nynoa7svlhhwk73b7u3c] inductor_config[aot_inductor.output_path]: ", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[aot_inductor.package]: False", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[aot_inductor.package_cpp_only]: False", "[v3hzzlv4tjgvp3pyhmzagjd25orl6n7nynoa7svlhhwk73b7u3c] inductor_config[aot_inductor.serialized_in_spec]: ", "[v3hzzlv4tjgvp3pyhmzagjd25orl6n7nynoa7svlhhwk73b7u3c] inductor_config[aot_inductor.serialized_out_spec]: ", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[aot_inductor.use_runtime_constant_folding]: False", "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[assert_indirect_indexing]: True", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[assume_aligned_inputs]: False", "[v3hzzlv4tjgvp3pyhmzagjd25orl6n7nynoa7svlhhwk73b7u3c] inductor_config[autoheuristic_collect]: ", "[jvchmi66fvqzlemhr5fcqorz5trfdtdalzfagtj2aolmimwqhdq] inductor_config[autoheuristic_log_path]: DEFAULT", "[jwbrgxes7vjqumngs5hyj6gn5nytv2whnppnzngvaagfmawhkkd] inductor_config[autoheuristic_use]: mixed_mm", "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[autotune_fallback_to_aten]: True", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[autotune_in_subproc]: False", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[autotune_local_cache]: False", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[autotune_multi_device]: False", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[autotune_remote_cache]: False", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[b2b_gemm_pass]: False", "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[batch_fusion]: True", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[benchmark_combo_kernel]: False", "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[benchmark_epilogue_fusion]: True", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[benchmark_fusion]: False", "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[benchmark_harness]: True", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[benchmark_kernel]: False", "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[bw_outputs_user_visible]: True", "[b4ha3ravs3qv237q65hpfqegbnoww7tf2ahcbu2i7xo6te5spqs] inductor_config[c_shim_version]: 2", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[check_stack_no_cycles_TESTING_ONLY]: False", "[aghvyrrgwvxijco2pk5wzc3cgmmthrbmgxitiibxuuscxdwrjd3] inductor_config[combo_kernel_allow_mixed_sizes]: 1", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[combo_kernel_foreach_dynamic_shapes]: False", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[combo_kernels]: False", "[aghvyrrgwvxijco2pk5wzc3cgmmthrbmgxitiibxuuscxdwrjd3] inductor_config[combo_kernels_autotune]: 1", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[comment_origin]: False", "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[comprehensive_padding]: True", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[compute_all_bounds]: False", "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[constant_and_index_propagation]: True", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[conv_1x1_as_mm]: False", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[coordinate_descent_check_all_directions]: False", "[aghvyrrgwvxijco2pk5wzc3cgmmthrbmgxitiibxuuscxdwrjd3] inductor_config[coordinate_descent_search_radius]: 1", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[coordinate_descent_tuning]: False", "[c7zj4qytmety6keurs3hsh5wn7foxp3dqx4kym2ucszzcb2ngrf] inductor_config[cpp.cxx]: (None, 'g++')", "[yrty22bseefglnysuoec4ji7j2rnaggdj3g33zzj7avogwfmgdw] inductor_config[cpp.descriptive_names]: original_aten", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[cpp.dynamic_threads]: False", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[cpp.enable_floating_point_contract_flag]: False", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[cpp.enable_kernel_profile]: False", "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[cpp.enable_loop_tail_vec]: True", "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[cpp.enable_tiling_heuristics]: True", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[cpp.enable_unsafe_math_opt_flag]: False", "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[cpp.fallback_scatter_reduce_sum]: True", "[tquy2we2efmowuj4wuqzcfcfdcrkzkzmwdae6hprj7fa64jpusq] inductor_config[cpp.gemm_cache_blocking]: None", "[aghvyrrgwvxijco2pk5wzc3cgmmthrbmgxitiibxuuscxdwrjd3] inductor_config[cpp.gemm_max_k_slices]: 1", "[tquy2we2efmowuj4wuqzcfcfdcrkzkzmwdae6hprj7fa64jpusq] inductor_config[cpp.gemm_thread_factors]: None", "[tquy2we2efmowuj4wuqzcfcfdcrkzkzmwdae6hprj7fa64jpusq] inductor_config[cpp.inject_log1p_bug_TESTING_ONLY]: None", "[tquy2we2efmowuj4wuqzcfcfdcrkzkzmwdae6hprj7fa64jpusq] inductor_config[cpp.inject_relu_bug_TESTING_ONLY]: None", "[ebt2ncs4f5y7dn7btzi76mnouepvzad474tmp5iju4wiuumjl4s] inductor_config[cpp.max_horizontal_fusion_size]: 16", "[g7rrnbg5yonzux3cfj5ovre5lob3ayda7qcfpxjvtwmiz4uicii] inductor_config[cpp.min_chunk_size]: 4096", "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[cpp.no_redundant_loops]: True", "[tquy2we2efmowuj4wuqzcfcfdcrkzkzmwdae6hprj7fa64jpusq] inductor_config[cpp.simdlen]: None", "[sz3im5ogc6asp7g4uqocnovype63tkdexzfrniv6hn2oank3biu] inductor_config[cpp.threads]: -1", "[tquy2we2efmowuj4wuqzcfcfdcrkzkzmwdae6hprj7fa64jpusq] inductor_config[cpp.vec_isa_ok]: None", "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[cpp.weight_prepack]: True", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[cpp_wrapper]: False", "[bsvfcwwoczx2rlkdz2eta6doujsymyihmi46hhwk6clrrvwcb6m] inductor_config[cpu_backend]: cpp", "[tquy2we2efmowuj4wuqzcfcfdcrkzkzmwdae6hprj7fa64jpusq] inductor_config[cuda.arch]: None", "[tvyftmtdmezlejo2xllu7awzv4pzc4vm4fub4b3gpl5jptjkosi] inductor_config[cuda.compile_opt_level]: -O1", "[tquy2we2efmowuj4wuqzcfcfdcrkzkzmwdae6hprj7fa64jpusq] inductor_config[cuda.cuda_cxx]: None", "[aghvyrrgwvxijco2pk5wzc3cgmmthrbmgxitiibxuuscxdwrjd3] inductor_config[cuda.cutlass_backend_min_gemm_size]: 1", "[tquy2we2efmowuj4wuqzcfcfdcrkzkzmwdae6hprj7fa64jpusq] inductor_config[cuda.cutlass_max_profiling_configs]: None", "[tquy2we2efmowuj4wuqzcfcfdcrkzkzmwdae6hprj7fa64jpusq] inductor_config[cuda.cutlass_op_allowlist_regex]: None", "[lwkz5chtpji756gurqw4foijfi7zfgljtnn5nmnvdi2skpt4mgh] inductor_config[cuda.cutlass_op_denylist_regex]: pingpong", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[cuda.enable_cuda_lto]: False", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[cuda.enable_debug_info]: False", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[cuda.enable_ptxas_info]: False", "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[cuda.generate_test_runner]: True", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[cuda.use_fast_math]: False", "[tquy2we2efmowuj4wuqzcfcfdcrkzkzmwdae6hprj7fa64jpusq] inductor_config[cuda.version]: None", "[caw4ly2z672k6kjfahoxwpajp5idhhtrpgf3ma2clylcp7c7aid] inductor_config[cuda_backend]: triton", "[pikr7bbcoixfzftsazp5ggufhdklj24babfry77bl4nuvyrrcp4] inductor_config[custom_op_default_layout_constraint]: needs_fixed_stride_order", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[dce]: False", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[debug]: False", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[debug_fusion]: False", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[debug_index_asserts]: False", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[debug_ir_traceback]: False", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[decompose_mem_bound_mm]: False", "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[developer_warnings]: True", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[disable_cpp_codegen]: False", "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[disable_padding_cpu]: True", "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[disable_progress]: True", "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[dynamic_scale_rblock]: True", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[efficient_conv_bn_eval_fx_passes]: False", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[emulate_precision_casts]: False", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[enable_auto_functionalized_v2]: False", "[v3hzzlv4tjgvp3pyhmzagjd25orl6n7nynoa7svlhhwk73b7u3c] inductor_config[enabled_metric_tables]: ", "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[epilogue_fusion]: True", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[epilogue_fusion_first]: False", "[lxxtoqhcoepwfokeiibd575gnxo3uzwiv4hmpomlwkpzqz3qzsh] inductor_config[estimate_op_runtime]: default", "[h25wqx6vliw4j5rtzzbv6latydxyei3deyg6v7wzvnzryfktuki] inductor_config[external_matmul]: []", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[fallback_random]: False", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[force_disable_caches]: False", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[force_fuse_int_mm_with_mul]: False", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[force_layout_optimization]: False", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[force_same_precision]: False", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[force_shape_pad]: False", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[freezing]: False", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[freezing_discard_parameters]: False", "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[fx_graph_cache]: True", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[fx_graph_remote_cache]: False", "[62lrdx35b7hnumwb7mp5oc5y5csm2abylvtdzfloct3noaqov3n] inductor_config[fx_passes_numeric_check]: {'pre_grad': False, 'post_grad': False, 'precision': 0.0001, 'num_iterations': 1, 'requires_optimizer': True}", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[generate_intermediate_hooks]: False", "[tquy2we2efmowuj4wuqzcfcfdcrkzkzmwdae6hprj7fa64jpusq] inductor_config[global_cache_dir]: None", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[group_fusion]: False", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[halide.asserts]: False", "[ljhgflgihidopsfsdcbqynv27nceykby3nutyd5jlcpq7n6e7l4] inductor_config[halide.cpu_target]: host", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[halide.debug]: False", "[wx7vmsmrdpk5ue2txlywp3lj3faqmdjphs5fgg2ehzsyno7uovg] inductor_config[halide.gpu_target]: host-cuda", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[halide.scan_kernels]: False", "[k5ogk6345jvklsnu7g2njqstiz2g6pm5wmqpgg3kasrmuqwjvl6] inductor_config[halide.scheduler_cpu]: Adams2019", "[svgytlua5wcyeia7wq7e6zgh5tsueikrnzchmdmouvmkpfsc2zq] inductor_config[halide.scheduler_cuda]: Anderson2021", "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[implicit_fallbacks]: True", "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[inplace_buffers]: True", "[5fxczt3ciyxitdhizb7sfsgn7fhpczcqsngttnt5ot2wyctk7co] inductor_config[inter_node_bw]: 25", "[yezuzjtg4h3jjur4jwtwiehbyixa7eonq4tqsqmwqve2lvvmrem] inductor_config[intra_node_bw]: 300", "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[is_nightly_or_source]: True", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[is_predispatch]: False", "[tquy2we2efmowuj4wuqzcfcfdcrkzkzmwdae6hprj7fa64jpusq] inductor_config[joint_custom_post_pass]: None", "[tquy2we2efmowuj4wuqzcfcfdcrkzkzmwdae6hprj7fa64jpusq] inductor_config[joint_custom_pre_pass]: None", "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[joint_graph_constant_folding]: True", "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[keep_output_stride]: True", "[j6c55jha5r2sdys2rwq7uqhtleea5dgjcye7nicfgft36v7xfvp] inductor_config[kernel_name_max_ops]: 10", "[4p2fdjlvxrcw7c7fvzm5huhtqxnro4kvkx56f7p5zyrxqkwooov] inductor_config[layout_opt_default]: 1", "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[layout_optimization]: True", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[loop_ordering_after_fusion]: False", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[max_autotune]: False", "[uqlsbif4zxd75vt522p52txyuguieipi2lwz5g5awt56lccqk7s] inductor_config[max_autotune_conv_backends]: ATEN,TRITON", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[max_autotune_gemm]: False", "[2y7luesktjrque3nr7qtxnum2mkbeegzdrsvkm3rvdlhqboajhx] inductor_config[max_autotune_gemm_backends]: ATEN,TRITON,CPP", "[jvchmi66fvqzlemhr5fcqorz5trfdtdalzfagtj2aolmimwqhdq] inductor_config[max_autotune_gemm_search_space]: DEFAULT", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[max_autotune_pointwise]: False", "[bh33ranllcgilhgmgr3qvygzxjm6isq5iexnfm3zx6fnr2zwlp2] inductor_config[max_autotune_subproc_graceful_timeout_seconds]: 1.0", "[iglov24t7x5ruci344aer2tm6nqshi4veuw4wxlssxtu46cx76m] inductor_config[max_autotune_subproc_result_timeout_seconds]: 60.0", "[pwoh5aypf4fxbntdvwt67rppxorqos6xr3w7qzeun6kblbfg2ga] inductor_config[max_autotune_subproc_terminate_timeout_seconds]: 2.0", "[aghvyrrgwvxijco2pk5wzc3cgmmthrbmgxitiibxuuscxdwrjd3] inductor_config[max_epilogue_benchmarked_choices]: 1", "[jykiys6ynafs3zdylwa5ggq6j655mxeh42d6mtdi22gffkrmiac] inductor_config[max_fusion_size]: 64", "[yttmfmxblgcbsvbokguzowcorrcxz5uunxtcvsbe6nijgcx45he] inductor_config[max_pointwise_cat_inputs]: 8", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[memory_planning]: False", "[x75won4jmsgeb63pcvwr2y4eteyzzdhmf5rv6xhjppie4hx2yu5] inductor_config[memory_pool]: intermediates", "[v2td5s4lnsvyxvaevy4chx6kc5h3mm2axazbgwimqule5zrzao7] inductor_config[mixed_mm_choice]: heuristic", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[nan_asserts]: False", "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[optimize_scatter_upon_const_tensor]: True", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[pad_channels_last]: False", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[pad_outputs]: False", "[ljdqgtysl3vdf7j6attlz5gmjg2ncihnveojfyubosplmkrjgra] inductor_config[padding_alignment_bytes]: 128", "[dnnw5ks3yxrp7mwvihb2hh4tqx35ye637xt33x64kw4fvz2nyzg] inductor_config[padding_stride_threshold]: 1024", "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[pattern_matcher]: True", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[permute_fusion]: False", "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[pick_loop_orders]: True", "[tquy2we2efmowuj4wuqzcfcfdcrkzkzmwdae6hprj7fa64jpusq] inductor_config[post_grad_custom_post_pass]: None", "[tquy2we2efmowuj4wuqzcfcfdcrkzkzmwdae6hprj7fa64jpusq] inductor_config[post_grad_custom_pre_pass]: None", "[4bryyl4ahh5whyg3zwqebpwmjnx6w77nqgqbdjlowju6lkqtn7w] inductor_config[post_grad_fusion_options]: {}", "[tquy2we2efmowuj4wuqzcfcfdcrkzkzmwdae6hprj7fa64jpusq] inductor_config[pre_grad_custom_pass]: None", "[4bryyl4ahh5whyg3zwqebpwmjnx6w77nqgqbdjlowju6lkqtn7w] inductor_config[pre_grad_fusion_options]: {}", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[profile_bandwidth]: False", "[tquy2we2efmowuj4wuqzcfcfdcrkzkzmwdae6hprj7fa64jpusq] inductor_config[profile_bandwidth_output]: None", "[v3hzzlv4tjgvp3pyhmzagjd25orl6n7nynoa7svlhhwk73b7u3c] inductor_config[profile_bandwidth_regex]: ", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[profile_bandwidth_with_do_bench_using_profiling]: False", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[profiler_mark_wrapper_call]: False", "[yttmfmxblgcbsvbokguzowcorrcxz5uunxtcvsbe6nijgcx45he] inductor_config[realize_acc_reads_threshold]: 8", "[rr5m5hsocoyodldz7vcvaizdwvm2rt34evmqdxvng7wz3tufvo6] inductor_config[realize_opcount_threshold]: 30", "[lkkae3meylaixfif4thncru4hjqeaislawjoghffrbwuscaagei] inductor_config[realize_reads_threshold]: 4", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[reorder_for_compute_comm_overlap]: False", "[ssupi7bu3rrhdpg2jyegzncu3kg3nnhklyliqvutaxgs7y7k3dx] inductor_config[reorder_for_compute_comm_overlap_passes]: ['reorder_compute_for_overlap', 'sink_waits', 'raise_comms']", "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[reorder_for_locality]: True", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[reorder_for_peak_memory]: False", "[h25wqx6vliw4j5rtzzbv6latydxyei3deyg6v7wzvnzryfktuki] inductor_config[rocm.arch]: []", "[tquy2we2efmowuj4wuqzcfcfdcrkzkzmwdae6hprj7fa64jpusq] inductor_config[rocm.ck_dir]: None", "[oartxnko2l7d67tzwwm2otcumaut3n4wwcfgz3o377hmcveu5ft] inductor_config[rocm.ck_supported_arch]: ['gfx90a', 'gfx940', 'gfx941', 'gfx942']", "[klfqjprnpfhcdurgvuikvc4rpd5ynkpk77toousr5h3u5roty6p] inductor_config[rocm.compile_opt_level]: -O2", "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[rocm.flush_denormals]: True", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[rocm.is_debug]: False", "[tquy2we2efmowuj4wuqzcfcfdcrkzkzmwdae6hprj7fa64jpusq] inductor_config[rocm.n_max_profiling_configs]: None", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[rocm.print_kernel_resource_usage]: False", "[tquy2we2efmowuj4wuqzcfcfdcrkzkzmwdae6hprj7fa64jpusq] inductor_config[rocm.rocm_home]: None", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[rocm.save_temps]: False", "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[rocm.use_fast_math]: True", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[rocm.use_preselected_instances]: False", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[save_args]: False", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[search_autotune_cache]: False", "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[shape_padding]: True", "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[size_asserts]: True", "[tquy2we2efmowuj4wuqzcfcfdcrkzkzmwdae6hprj7fa64jpusq] inductor_config[sleep_sec_TESTING_ONLY]: None", "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[split_cat_fx_passes]: True", "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[split_reductions]: True", "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[static_weight_shapes]: True", "[tquy2we2efmowuj4wuqzcfcfdcrkzkzmwdae6hprj7fa64jpusq] inductor_config[triton.autotune_at_compile_time]: None", "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[triton.autotune_cublasLt]: True", "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[triton.autotune_pointwise]: True", "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[triton.codegen_upcast_to_fp32]: True", "[tuax46wac7rfv2trf5gcps6vleo3cq44lbnrdxtprvo3ljjaddj] inductor_config[triton.cudagraph_dynamic_shape_warn_limit]: 50", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[triton.cudagraph_skip_dynamic_graphs]: False", "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[triton.cudagraph_support_input_mutation]: True", "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[triton.cudagraph_trees]: True", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[triton.cudagraph_trees_history_recording]: False", "[ljdqgtysl3vdf7j6attlz5gmjg2ncihnveojfyubosplmkrjgra] inductor_config[triton.cudagraph_unexpected_rerecord_limit]: 128", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[triton.cudagraphs]: False", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[triton.debug_sync_graph]: False", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[triton.debug_sync_kernel]: False", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[triton.dense_indexing]: False", "[yrty22bseefglnysuoec4ji7j2rnaggdj3g33zzj7avogwfmgdw] inductor_config[triton.descriptive_names]: original_aten", "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[triton.divisible_by_16]: True", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[triton.fast_path_cudagraph_asserts]: False", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[triton.force_cudagraph_sync]: False", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[triton.force_cudagraphs_warmup]: False", "[tquy2we2efmowuj4wuqzcfcfdcrkzkzmwdae6hprj7fa64jpusq] inductor_config[triton.inject_relu_bug_TESTING_ONLY]: None", "[pr5nr4a7dthirgd2ljo3d2xakc63ywxugusu6mkmr6gmpeliyib] inductor_config[triton.max_tiles]: 2", "[fv6slhtedtydps5s5u2etitscliblzcidyitqf7krsv4e23fzk6] inductor_config[triton.min_split_scan_rblock]: 256", "[vrl5ktomgtzox5xucd3np6vug3vyj6hwwzahqijuwpmamlv7ohi] inductor_config[triton.multi_kernel]: 0", "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[triton.persistent_reductions]: True", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[triton.prefer_nd_tiling]: False", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[triton.skip_cudagraph_warmup]: False", "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[triton.slow_path_cudagraph_asserts]: True", "[ebt2ncs4f5y7dn7btzi76mnouepvzad474tmp5iju4wiuumjl4s] inductor_config[triton.spill_threshold]: 16", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[triton.store_cubin]: False", "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[triton.tiling_prevents_pointwise_fusion]: True", "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[triton.tiling_prevents_reduction_fusion]: True", "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[triton.unique_kernel_names]: True", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[triton.use_block_ptr]: False", "[vzzema5ityqj2wepdmkulue7q5pcevdr5h27oxxutf35d4tjume] inductor_config[triton_kernel_default_layout_constraint]: flexible_layout", "[wft6ljqsfr3x4m7fa5zuyb7cwknky4irrxz4bjr6uzr2yiopxqj] inductor_config[unbacked_symint_fallback]: 8192", "[yttmfmxblgcbsvbokguzowcorrcxz5uunxtcvsbe6nijgcx45he] inductor_config[unroll_reductions_threshold]: 8", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[unsafe_ignore_unsupported_triton_autotune_args]: False", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[use_minimal_arrayref_interface]: False", "[cev5uo2jlwdhw2uyzcm7vr6cl23azjfw437f5r5lskm7spucos6] inductor_config[use_mixed_mm]: True", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[verbose_progress]: False", "[esstihe2nyydk4mhzpvox3qkajyu5y5t23hk3fi2me7jn75xi3o] inductor_config[warn_mix_layout]: False"], "cache_event_time": 1727975463815134085, "cache_state": "miss", "time_taken_ns": 6500727742} +V1003 10:11:10.318000 2235078 torch/_dynamo/utils.py:988] {"chromium_event": {}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0, "has_payload": "59401c365876e3190b828c9baef54d73"} + { + "name": "inductor_compile", + "ts": 1727975470318146.5, + "args": { + "cache_stats": { + "fxgraph_cache_hit": 1, + "fxgraph_cache_miss": 2, + "fxgraph_cache_bypass": 0 + } + }, + "ph": "E", + "cat": "dynamo_timed", + "tid": 0, + "pid": 0 + } +V1003 10:11:10.318000 2235078 torch/_dynamo/utils.py:988] {"chromium_event": {}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0, "has_payload": "75e1054f04f269f6411028767770c53c"} + { + "name": "compile_fx_inner", + "ts": 1727975470318439.5, + "args": { + "cache_stats": { + "fxgraph_cache_hit": 1, + "fxgraph_cache_miss": 2, + "fxgraph_cache_bypass": 0 + } + }, + "ph": "E", + "cat": "dynamo_timed", + "tid": 0, + "pid": 0 + } +V1003 10:11:10.319000 2235078 torch/_dynamo/utils.py:988] {"chromium_event": {}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0, "has_payload": "54543856a0e4177bd66c38146fda7120"} + { + "name": "compile_fx..fw_compiler_base", + "ts": 1727975470319104.2, + "args": { + "cache_stats": { + "fxgraph_cache_hit": 1, + "fxgraph_cache_miss": 2, + "fxgraph_cache_bypass": 0 + } + }, + "ph": "E", + "cat": "dynamo_timed", + "tid": 0, + "pid": 0 + } +V1003 10:11:10.322000 2235078 torch/_dynamo/utils.py:988] {"chromium_event": {}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0, "has_payload": "0017891a0ab9c15e62e1b8125198b31d"} + { + "name": "create_aot_dispatcher_function", + "ts": 1727975470322488.8, + "args": { + "cache_stats": { + "fxgraph_cache_hit": 1, + "fxgraph_cache_miss": 2, + "fxgraph_cache_bypass": 0 + } + }, + "ph": "E", + "cat": "dynamo_timed", + "tid": 0, + "pid": 0 + } +V1003 10:11:10.323000 2235078 torch/_dynamo/utils.py:988] {"chromium_event": {}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0, "has_payload": "0927305357c25444fda9907c2d33a10f"} + { + "name": "backend_compile", + "ts": 1727975470323094.0, + "args": { + "cache_stats": { + "fxgraph_cache_hit": 1, + "fxgraph_cache_miss": 2, + "fxgraph_cache_bypass": 0 + } + }, + "ph": "E", + "cat": "dynamo_timed", + "tid": 0, + "pid": 0 + } +V1003 10:11:10.323000 2235078 torch/_dynamo/utils.py:988] {"chromium_event": {}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0, "has_payload": "f3906ae17fd7aa60928cc5980667652c"} + { + "name": "OutputGraph.call_user_compiler", + "ts": 1727975470323375.2, + "args": { + "cache_stats": { + "fxgraph_cache_hit": 1, + "fxgraph_cache_miss": 2, + "fxgraph_cache_bypass": 0 + } + }, + "ph": "E", + "cat": "dynamo_timed", + "tid": 0, + "pid": 0 + } +V1003 10:11:10.354000 2235078 torch/_dynamo/guards.py:2311] {"dynamo_cpp_guards_str": {}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0, "has_payload": "1b9c89081ea4384ef6cc3f1c7563841f"} + + TREE_GUARD_MANAGER: + +- RootGuardManager + | +- DEFAULT_DEVICE: utils_device.CURRENT_DEVICE == None # _dynamo/output_graph.py:471 in init_ambient_guards + | +- GLOBAL_STATE: ___check_global_state() + | +- TORCH_FUNCTION_MODE_STACK: ___check_torch_function_mode_stack() + | +- GuardManager: source=L['k'], accessed_by=DictGetItemGuardAccessor(k) + | | +- TYPE_MATCH: ___check_type_id(L['k'], 82028112) + | | +- TENSOR_MATCH: check_tensor(L['k'], Tensor, DispatchKeySet(CUDA, BackendSelect, ADInplaceOrView, AutogradCUDA), torch.float32, device=0, requires_grad=False, size=[1, 4, 512, 64], stride=[131072, 32768, 64, 1]) + | | +- NO_HASATTR: hasattr(L['k'], '_dynamo_dynamic_indices') == False + | | +- NO_TENSOR_ALIASING: check_no_aliasing(L['k'], L['q'], L['v'], L['block_mask'].q_indices, L['block_mask'].kv_indices, L['block_mask'].q_num_blocks, L['block_mask'].kv_num_blocks, L['block_mask'].full_q_indices, L['block_mask'].full_kv_indices, L['block_mask'].full_q_num_blocks, L['block_mask'].full_kv_num_blocks) + | +- GuardManager: source=L['q'], accessed_by=DictGetItemGuardAccessor(q) + | | +- TYPE_MATCH: ___check_type_id(L['q'], 82028112) + | | +- TENSOR_MATCH: check_tensor(L['q'], Tensor, DispatchKeySet(CUDA, BackendSelect, ADInplaceOrView, AutogradCUDA), torch.float32, device=0, requires_grad=False, size=[1, 4, 512, 64], stride=[131072, 32768, 64, 1]) + | | +- NO_HASATTR: hasattr(L['q'], '_dynamo_dynamic_indices') == False + | | +- NO_TENSOR_ALIASING + | +- GuardManager: source=L['v'], accessed_by=DictGetItemGuardAccessor(v) + | | +- TYPE_MATCH: ___check_type_id(L['v'], 82028112) + | | +- TENSOR_MATCH: check_tensor(L['v'], Tensor, DispatchKeySet(CUDA, BackendSelect, ADInplaceOrView, AutogradCUDA), torch.float32, device=0, requires_grad=False, size=[1, 4, 512, 64], stride=[131072, 32768, 64, 1]) + | | +- NO_HASATTR: hasattr(L['v'], '_dynamo_dynamic_indices') == False + | | +- NO_TENSOR_ALIASING + | +- GuardManager: source=L['block_mask'], accessed_by=DictGetItemGuardAccessor(block_mask) + | | +- TYPE_MATCH: ___check_type_id(L['block_mask'], 387600320) + | | +- GuardManager: source=L['block_mask'].mask_mod, accessed_by=GetAttrGuardAccessor(mask_mod) + | | | +- GuardManager: source=L['block_mask'].mask_mod.__code__, accessed_by=GetAttrGuardAccessor(__code__) + | | | | +- ID_MATCH: ___check_obj_id(L['block_mask'].mask_mod.__code__, 140413271880128) + | | +- GuardManager: source=L['block_mask'].q_indices, accessed_by=GetAttrGuardAccessor(q_indices) + | | | +- TENSOR_MATCH: check_tensor(L['block_mask'].q_indices, Tensor, DispatchKeySet(CUDA, BackendSelect, ADInplaceOrView, AutogradCUDA), torch.int32, device=0, requires_grad=False, size=[1, 1, 16, 16], stride=[256, 256, 16, 1]) + | | | +- NO_HASATTR: hasattr(L['block_mask'].q_indices, '_dynamo_dynamic_indices') == False + | | | +- NO_TENSOR_ALIASING + | | +- GuardManager: source=L['block_mask'].BLOCK_SIZE, accessed_by=GetAttrGuardAccessor(BLOCK_SIZE) + | | | +- TYPE_MATCH: ___check_type_id(L['block_mask'].BLOCK_SIZE, 8815232) + | | | +- LENGTH_CHECK: len(L['block_mask'].BLOCK_SIZE) == 2 + | | | +- GuardManager: source=L['block_mask'].BLOCK_SIZE[0], accessed_by=TupleGetItemGuardAccessor(0) + | | | | +- EQUALS_MATCH: L['block_mask'].BLOCK_SIZE[0] == 128 + | | | +- GuardManager: source=L['block_mask'].BLOCK_SIZE[1], accessed_by=TupleGetItemGuardAccessor(1) + | | | | +- EQUALS_MATCH: L['block_mask'].BLOCK_SIZE[1] == 128 + | | +- GuardManager: source=L['block_mask'].kv_indices, accessed_by=GetAttrGuardAccessor(kv_indices) + | | | +- TENSOR_MATCH: check_tensor(L['block_mask'].kv_indices, Tensor, DispatchKeySet(CUDA, BackendSelect, ADInplaceOrView, AutogradCUDA), torch.int32, device=0, requires_grad=False, size=[1, 1, 16, 16], stride=[256, 256, 16, 1]) + | | | +- NO_HASATTR: hasattr(L['block_mask'].kv_indices, '_dynamo_dynamic_indices') == False + | | | +- NO_TENSOR_ALIASING + | | +- GuardManager: source=L['block_mask'].q_num_blocks, accessed_by=GetAttrGuardAccessor(q_num_blocks) + | | | +- TENSOR_MATCH: check_tensor(L['block_mask'].q_num_blocks, Tensor, DispatchKeySet(CUDA, BackendSelect, ADInplaceOrView, AutogradCUDA), torch.int32, device=0, requires_grad=False, size=[1, 1, 16], stride=[16, 16, 1]) + | | | +- NO_HASATTR: hasattr(L['block_mask'].q_num_blocks, '_dynamo_dynamic_indices') == False + | | | +- NO_TENSOR_ALIASING + | | +- GuardManager: source=L['block_mask'].kv_num_blocks, accessed_by=GetAttrGuardAccessor(kv_num_blocks) + | | | +- TENSOR_MATCH: check_tensor(L['block_mask'].kv_num_blocks, Tensor, DispatchKeySet(CUDA, BackendSelect, ADInplaceOrView, AutogradCUDA), torch.int32, device=0, requires_grad=False, size=[1, 1, 16], stride=[16, 16, 1]) + | | | +- NO_HASATTR: hasattr(L['block_mask'].kv_num_blocks, '_dynamo_dynamic_indices') == False + | | | +- NO_TENSOR_ALIASING + | | +- GuardManager: source=L['block_mask'].full_q_indices, accessed_by=GetAttrGuardAccessor(full_q_indices) + | | | +- TENSOR_MATCH: check_tensor(L['block_mask'].full_q_indices, Tensor, DispatchKeySet(CUDA, BackendSelect, ADInplaceOrView, AutogradCUDA), torch.int32, device=0, requires_grad=False, size=[1, 1, 16, 16], stride=[256, 256, 16, 1]) + | | | +- NO_HASATTR: hasattr(L['block_mask'].full_q_indices, '_dynamo_dynamic_indices') == False + | | | +- NO_TENSOR_ALIASING + | | +- GuardManager: source=L['block_mask'].full_kv_indices, accessed_by=GetAttrGuardAccessor(full_kv_indices) + | | | +- TENSOR_MATCH: check_tensor(L['block_mask'].full_kv_indices, Tensor, DispatchKeySet(CUDA, BackendSelect, ADInplaceOrView, AutogradCUDA), torch.int32, device=0, requires_grad=False, size=[1, 1, 16, 16], stride=[256, 256, 16, 1]) + | | | +- NO_HASATTR: hasattr(L['block_mask'].full_kv_indices, '_dynamo_dynamic_indices') == False + | | | +- NO_TENSOR_ALIASING + | | +- GuardManager: source=L['block_mask'].full_q_num_blocks, accessed_by=GetAttrGuardAccessor(full_q_num_blocks) + | | | +- TENSOR_MATCH: check_tensor(L['block_mask'].full_q_num_blocks, Tensor, DispatchKeySet(CUDA, BackendSelect, ADInplaceOrView, AutogradCUDA), torch.int32, device=0, requires_grad=False, size=[1, 1, 16], stride=[16, 16, 1]) + | | | +- NO_HASATTR: hasattr(L['block_mask'].full_q_num_blocks, '_dynamo_dynamic_indices') == False + | | | +- NO_TENSOR_ALIASING + | | +- GuardManager: source=L['block_mask'].full_kv_num_blocks, accessed_by=GetAttrGuardAccessor(full_kv_num_blocks) + | | | +- TENSOR_MATCH: check_tensor(L['block_mask'].full_kv_num_blocks, Tensor, DispatchKeySet(CUDA, BackendSelect, ADInplaceOrView, AutogradCUDA), torch.int32, device=0, requires_grad=False, size=[1, 1, 16], stride=[16, 16, 1]) + | | | +- NO_HASATTR: hasattr(L['block_mask'].full_kv_num_blocks, '_dynamo_dynamic_indices') == False + | | | +- NO_TENSOR_ALIASING + | | +- GuardManager: source=L['block_mask'].as_tuple, accessed_by=GetAttrGuardAccessor(as_tuple) + | | | +- GuardManager: source=L['block_mask'].as_tuple, accessed_by=FuncDefaultsGuardAccessor + | | | | +- GuardManager: source=L['block_mask'].as_tuple.__defaults__[0], accessed_by=GetItemGuardAccessor(0) + | | | | | +- ID_MATCH: ___check_obj_id(L['block_mask'].as_tuple.__defaults__[0], 8911040) + | +- GuardManager: source=L['score_mod2'], accessed_by=DictGetItemGuardAccessor(score_mod2) + | | +- GuardManager: source=L['score_mod2'].__code__, accessed_by=GetAttrGuardAccessor(__code__) + | | | +- ID_MATCH: ___check_obj_id(L['score_mod2'].__code__, 140413275216112) + | +- GuardManager: source=L['flex_attention'], accessed_by=DictGetItemGuardAccessor(flex_attention) + | | +- GuardManager: source=L['flex_attention'].__code__, accessed_by=GetAttrGuardAccessor(__code__) + | | | +- ID_MATCH: ___check_obj_id(L['flex_attention'].__code__, 387082992) + | | +- GuardManager: source=L['flex_attention'], accessed_by=FuncDefaultsGuardAccessor + | | | +- GuardManager: source=L['flex_attention'].__defaults__[2], accessed_by=GetItemGuardAccessor(2) + | | | | +- ID_MATCH: ___check_obj_id(L['flex_attention'].__defaults__[2], 8825760) + | | | +- GuardManager: source=L['flex_attention'].__defaults__[3], accessed_by=GetItemGuardAccessor(3) + | | | | +- ID_MATCH: ___check_obj_id(L['flex_attention'].__defaults__[3], 8910592) + | | | +- GuardManager: source=L['flex_attention'].__defaults__[4], accessed_by=GetItemGuardAccessor(4) + | | | | +- ID_MATCH: ___check_obj_id(L['flex_attention'].__defaults__[4], 8910592) + | | | +- GuardManager: source=L['flex_attention'].__defaults__[5], accessed_by=GetItemGuardAccessor(5) + | | | | +- ID_MATCH: ___check_obj_id(L['flex_attention'].__defaults__[5], 8825760) + | +- GuardManager: source=G, accessed_by=GlobalsGuardAccessor + | | +- GuardManager: source=G['__builtins_dict___10'], accessed_by=DictGetItemGuardAccessor(__builtins_dict___10) + | | | +- GuardManager: source=G['__builtins_dict___10']['len'], accessed_by=DictGetItemGuardAccessor(len) + | | | | +- ID_MATCH: ___check_obj_id(G['__builtins_dict___10']['len'], 140413275558816) + | | | +- GuardManager: source=G['__builtins_dict___10']['sum'], accessed_by=DictGetItemGuardAccessor(sum) + | | | | +- ID_MATCH: ___check_obj_id(G['__builtins_dict___10']['sum'], 140413275559936) + | | | +- GuardManager: source=G['__builtins_dict___10']['list'], accessed_by=DictGetItemGuardAccessor(list) + | | | | +- ID_MATCH: ___check_obj_id(G['__builtins_dict___10']['list'], 8844320) + | | | +- GuardManager: source=G['__builtins_dict___10']['type'], accessed_by=DictGetItemGuardAccessor(type) + | | | | +- ID_MATCH: ___check_obj_id(G['__builtins_dict___10']['type'], 8813248) + | | | +- GuardManager: source=G['__builtins_dict___10']['tuple'], accessed_by=DictGetItemGuardAccessor(tuple) + | | | | +- ID_MATCH: ___check_obj_id(G['__builtins_dict___10']['tuple'], 8815232) + | | | +- GuardManager: source=G['__builtins_dict___10']['object'], accessed_by=DictGetItemGuardAccessor(object) + | | | | +- ID_MATCH: ___check_obj_id(G['__builtins_dict___10']['object'], 8813984) + | | | +- GuardManager: source=G['__builtins_dict___10']['isinstance'], accessed_by=DictGetItemGuardAccessor(isinstance) + | | | | +- ID_MATCH: ___check_obj_id(G['__builtins_dict___10']['isinstance'], 140413275558496) + | | +- GuardManager: source=G['__import_torch_dot_utils_dot__pytree'], accessed_by=DictGetItemGuardAccessor(__import_torch_dot_utils_dot__pytree) + | | | +- ID_MATCH: ___check_obj_id(G['__import_torch_dot_utils_dot__pytree'], 140411217627952) + | | | +- GuardManager: source=G['__import_torch_dot_utils_dot__pytree'].TreeSpec, accessed_by=GetAttrGuardAccessor(TreeSpec) + | | | | +- ID_MATCH: ___check_obj_id(G['__import_torch_dot_utils_dot__pytree'].TreeSpec, 84866496) + | | | +- GuardManager: source=G['__import_torch_dot_utils_dot__pytree']._is_leaf, accessed_by=GetAttrGuardAccessor(_is_leaf) + | | | | +- GuardManager: source=G['__import_torch_dot_utils_dot__pytree']._is_leaf.__code__, accessed_by=GetAttrGuardAccessor(__code__) + | | | | | +- ID_MATCH: ___check_obj_id(G['__import_torch_dot_utils_dot__pytree']._is_leaf.__code__, 140411217262720) + | | | +- GuardManager: source=G['__import_torch_dot_utils_dot__pytree']._LEAF_SPEC, accessed_by=GetAttrGuardAccessor(_LEAF_SPEC) + | | | | +- TYPE_MATCH: ___check_type_id(G['__import_torch_dot_utils_dot__pytree']._LEAF_SPEC, 85171104) + | | | | +- GuardManager: source=G['__import_torch_dot_utils_dot__pytree']._LEAF_SPEC.type, accessed_by=GetAttrGuardAccessor(type) + | | | | | +- ID_MATCH: ___check_obj_id(G['__import_torch_dot_utils_dot__pytree']._LEAF_SPEC.type, 8825760) + | | | | +- GuardManager: source=G['__import_torch_dot_utils_dot__pytree']._LEAF_SPEC.context, accessed_by=GetAttrGuardAccessor(context) + | | | | | +- ID_MATCH: ___check_obj_id(G['__import_torch_dot_utils_dot__pytree']._LEAF_SPEC.context, 8825760) + | | | | +- GuardManager: source=G['__import_torch_dot_utils_dot__pytree']._LEAF_SPEC.num_nodes, accessed_by=GetAttrGuardAccessor(num_nodes) + | | | | | +- EQUALS_MATCH: G['__import_torch_dot_utils_dot__pytree']._LEAF_SPEC.num_nodes == 1 + | | | | +- GuardManager: source=G['__import_torch_dot_utils_dot__pytree']._LEAF_SPEC.num_leaves, accessed_by=GetAttrGuardAccessor(num_leaves) + | | | | | +- EQUALS_MATCH: G['__import_torch_dot_utils_dot__pytree']._LEAF_SPEC.num_leaves == 1 + | | | | +- GuardManager: source=G['__import_torch_dot_utils_dot__pytree']._LEAF_SPEC.num_children, accessed_by=GetAttrGuardAccessor(num_children) + | | | | | +- EQUALS_MATCH: G['__import_torch_dot_utils_dot__pytree']._LEAF_SPEC.num_children == 0 + | | | | +- GuardManager: source=G['__import_torch_dot_utils_dot__pytree']._LEAF_SPEC.children_specs, accessed_by=GetAttrGuardAccessor(children_specs) + | | | | | +- TYPE_MATCH: ___check_type_id(G['__import_torch_dot_utils_dot__pytree']._LEAF_SPEC.children_specs, 8844320) + | | | | | +- LENGTH_CHECK: not G['__import_torch_dot_utils_dot__pytree']._LEAF_SPEC.children_specs + | | | +- GuardManager: source=G['__import_torch_dot_utils_dot__pytree']._get_node_type, accessed_by=GetAttrGuardAccessor(_get_node_type) + | | | | +- GuardManager: source=G['__import_torch_dot_utils_dot__pytree']._get_node_type.__code__, accessed_by=GetAttrGuardAccessor(__code__) + | | | | | +- ID_MATCH: ___check_obj_id(G['__import_torch_dot_utils_dot__pytree']._get_node_type.__code__, 140411217262448) + | | | +- DictGuardManager: source=G['__import_torch_dot_utils_dot__pytree'].SUPPORTED_NODES, accessed_by=GetAttrGuardAccessor(SUPPORTED_NODES) + | | | | +- DICT_VERSION: ___dict_version(G['__import_torch_dot_utils_dot__pytree'].SUPPORTED_NODES) == 519596 + | | | | +- KeyValueManager pair at index=1 + | | | | | +- ValueManager: GuardManager: source=G['__import_torch_dot_utils_dot__pytree'].SUPPORTED_NODES[list(G['__import_torch_dot_utils_dot__pytree'].SUPPORTED_NODES.keys())[1]] + | | | | | | +- GuardManager: source=G['__import_torch_dot_utils_dot__pytree'].SUPPORTED_NODES[list(G['__import_torch_dot_utils_dot__pytree'].SUPPORTED_NODES.keys())[1]].flatten_fn, accessed_by=GetAttrGuardAccessor(flatten_fn) + | | | | | | | +- GuardManager: source=G['__import_torch_dot_utils_dot__pytree'].SUPPORTED_NODES[list(G['__import_torch_dot_utils_dot__pytree'].SUPPORTED_NODES.keys())[1]].flatten_fn.__code__, accessed_by=GetAttrGuardAccessor(__code__) + | | | | | | | | +- ID_MATCH: ___check_obj_id(G['__import_torch_dot_utils_dot__pytree'].SUPPORTED_NODES[list(G['__import_torch_dot_utils_dot__pytree'].SUPPORTED_NODES.keys())[1]].flatten_fn.__code__, 140411196281984) + | | | | | | +- GuardManager: source=G['__import_torch_dot_utils_dot__pytree'].SUPPORTED_NODES[list(G['__import_torch_dot_utils_dot__pytree'].SUPPORTED_NODES.keys())[1]].unflatten_fn, accessed_by=GetAttrGuardAccessor(unflatten_fn) + | | | | | | | +- GuardManager: source=G['__import_torch_dot_utils_dot__pytree'].SUPPORTED_NODES[list(G['__import_torch_dot_utils_dot__pytree'].SUPPORTED_NODES.keys())[1]].unflatten_fn.__code__, accessed_by=GetAttrGuardAccessor(__code__) + | | | | | | | | +- ID_MATCH: ___check_obj_id(G['__import_torch_dot_utils_dot__pytree'].SUPPORTED_NODES[list(G['__import_torch_dot_utils_dot__pytree'].SUPPORTED_NODES.keys())[1]].unflatten_fn.__code__, 140411217182288) + | | | +- GuardManager: source=G['__import_torch_dot_utils_dot__pytree']._tree_flatten_helper, accessed_by=GetAttrGuardAccessor(_tree_flatten_helper) + | | | | +- GuardManager: source=G['__import_torch_dot_utils_dot__pytree']._tree_flatten_helper.__code__, accessed_by=GetAttrGuardAccessor(__code__) + | | | | | +- ID_MATCH: ___check_obj_id(G['__import_torch_dot_utils_dot__pytree']._tree_flatten_helper.__code__, 140411217413040) + | | | +- GuardManager: source=G['__import_torch_dot_utils_dot__pytree']._is_namedtuple_instance, accessed_by=GetAttrGuardAccessor(_is_namedtuple_instance) + | | | | +- GuardManager: source=G['__import_torch_dot_utils_dot__pytree']._is_namedtuple_instance.__code__, accessed_by=GetAttrGuardAccessor(__code__) + | | | | | +- ID_MATCH: ___check_obj_id(G['__import_torch_dot_utils_dot__pytree']._is_namedtuple_instance.__code__, 140411217412592) + | | +- GuardManager: source=G['__import_torch_dot__dynamo_dot_comptime'], accessed_by=DictGetItemGuardAccessor(__import_torch_dot__dynamo_dot_comptime) + | | | +- ID_MATCH: ___check_obj_id(G['__import_torch_dot__dynamo_dot_comptime'], 140410226912176) + | | +- GuardManager: source=G['__import_torch_dot__dynamo_dot_decorators'], accessed_by=DictGetItemGuardAccessor(__import_torch_dot__dynamo_dot_decorators) + | | | +- ID_MATCH: ___check_obj_id(G['__import_torch_dot__dynamo_dot_decorators'], 140410226910096) + | | | +- GuardManager: source=G['__import_torch_dot__dynamo_dot_decorators'].is_compiling, accessed_by=GetAttrGuardAccessor(is_compiling) + | | | | +- ID_MATCH: ___check_obj_id(G['__import_torch_dot__dynamo_dot_decorators'].is_compiling, 140410376252096) + | | +- GuardManager: source=G['__import_torch_dot_nn_dot_attention_dot__utils'], accessed_by=DictGetItemGuardAccessor(__import_torch_dot_nn_dot_attention_dot__utils) + | | | +- ID_MATCH: ___check_obj_id(G['__import_torch_dot_nn_dot_attention_dot__utils'], 140409673896784) + | | | +- GuardManager: source=G['__import_torch_dot_nn_dot_attention_dot__utils']._SUPPORTED_HEAD_DIMS, accessed_by=GetAttrGuardAccessor(_SUPPORTED_HEAD_DIMS) + | | | | +- TYPE_MATCH: ___check_type_id(G['__import_torch_dot_nn_dot_attention_dot__utils']._SUPPORTED_HEAD_DIMS, 8844320) + | | | | +- LENGTH_CHECK: len(G['__import_torch_dot_nn_dot_attention_dot__utils']._SUPPORTED_HEAD_DIMS) == 10 + | | | | +- GuardManager: source=G['__import_torch_dot_nn_dot_attention_dot__utils']._SUPPORTED_HEAD_DIMS[0], accessed_by=ListGetItemGuardAccessor(0) + | | | | | +- EQUALS_MATCH: G['__import_torch_dot_nn_dot_attention_dot__utils']._SUPPORTED_HEAD_DIMS[0] == 2 + | | | | +- GuardManager: source=G['__import_torch_dot_nn_dot_attention_dot__utils']._SUPPORTED_HEAD_DIMS[1], accessed_by=ListGetItemGuardAccessor(1) + | | | | | +- EQUALS_MATCH: G['__import_torch_dot_nn_dot_attention_dot__utils']._SUPPORTED_HEAD_DIMS[1] == 4 + | | | | +- GuardManager: source=G['__import_torch_dot_nn_dot_attention_dot__utils']._SUPPORTED_HEAD_DIMS[2], accessed_by=ListGetItemGuardAccessor(2) + | | | | | +- EQUALS_MATCH: G['__import_torch_dot_nn_dot_attention_dot__utils']._SUPPORTED_HEAD_DIMS[2] == 8 + | | | | +- GuardManager: source=G['__import_torch_dot_nn_dot_attention_dot__utils']._SUPPORTED_HEAD_DIMS[3], accessed_by=ListGetItemGuardAccessor(3) + | | | | | +- EQUALS_MATCH: G['__import_torch_dot_nn_dot_attention_dot__utils']._SUPPORTED_HEAD_DIMS[3] == 16 + | | | | +- GuardManager: source=G['__import_torch_dot_nn_dot_attention_dot__utils']._SUPPORTED_HEAD_DIMS[4], accessed_by=ListGetItemGuardAccessor(4) + | | | | | +- EQUALS_MATCH: G['__import_torch_dot_nn_dot_attention_dot__utils']._SUPPORTED_HEAD_DIMS[4] == 32 + | | | | +- GuardManager: source=G['__import_torch_dot_nn_dot_attention_dot__utils']._SUPPORTED_HEAD_DIMS[5], accessed_by=ListGetItemGuardAccessor(5) + | | | | | +- EQUALS_MATCH: G['__import_torch_dot_nn_dot_attention_dot__utils']._SUPPORTED_HEAD_DIMS[5] == 64 + | | +- GuardManager: source=G['__import_torch_dot_nn_dot_attention_dot_flex_attention'], accessed_by=DictGetItemGuardAccessor(__import_torch_dot_nn_dot_attention_dot_flex_attention) + | | | +- ID_MATCH: ___check_obj_id(G['__import_torch_dot_nn_dot_attention_dot_flex_attention'], 140409673895824) + | | | +- GuardManager: source=G['__import_torch_dot_nn_dot_attention_dot_flex_attention'].math, accessed_by=GetAttrGuardAccessor(math) + | | | | +- ID_MATCH: ___check_obj_id(G['__import_torch_dot_nn_dot_attention_dot_flex_attention'].math, 140413266939392) + | | | | +- GuardManager: source=G['__import_torch_dot_nn_dot_attention_dot_flex_attention'].math.sqrt, accessed_by=GetAttrGuardAccessor(sqrt) + | | | | | +- ID_MATCH: ___check_obj_id(G['__import_torch_dot_nn_dot_attention_dot_flex_attention'].math.sqrt, 140413266943072) + | | | +- GuardManager: source=G['__import_torch_dot_nn_dot_attention_dot_flex_attention'].torch, accessed_by=GetAttrGuardAccessor(torch) + | | | | +- ID_MATCH: ___check_obj_id(G['__import_torch_dot_nn_dot_attention_dot_flex_attention'].torch, 140413267918368) + | | | | +- GuardManager: source=G['__import_torch_dot_nn_dot_attention_dot_flex_attention'].torch._dynamo, accessed_by=GetAttrGuardAccessor(_dynamo) + | | | | | +- ID_MATCH: ___check_obj_id(G['__import_torch_dot_nn_dot_attention_dot_flex_attention'].torch._dynamo, 140413260098400) + | | | | | +- GuardManager: source=G['__import_torch_dot_nn_dot_attention_dot_flex_attention'].torch._dynamo.mark_static, accessed_by=GetAttrGuardAccessor(mark_static) + | | | | | | +- GuardManager: source=G['__import_torch_dot_nn_dot_attention_dot_flex_attention'].torch._dynamo.mark_static.__code__, accessed_by=GetAttrGuardAccessor(__code__) + | | | | | | | +- ID_MATCH: ___check_obj_id(G['__import_torch_dot_nn_dot_attention_dot_flex_attention'].torch._dynamo.mark_static.__code__, 123166432) + | | | | +- GuardManager: source=G['__import_torch_dot_nn_dot_attention_dot_flex_attention'].torch.compiler, accessed_by=GetAttrGuardAccessor(compiler) + | | | | | +- ID_MATCH: ___check_obj_id(G['__import_torch_dot_nn_dot_attention_dot_flex_attention'].torch.compiler, 140410826010400) + | | | | | +- GuardManager: source=G['__import_torch_dot_nn_dot_attention_dot_flex_attention'].torch.compiler.is_dynamo_compiling, accessed_by=GetAttrGuardAccessor(is_dynamo_compiling) + | | | | | | +- ID_MATCH: ___check_obj_id(G['__import_torch_dot_nn_dot_attention_dot_flex_attention'].torch.compiler.is_dynamo_compiling, 140410826132992) + | | | +- GuardManager: source=G['__import_torch_dot_nn_dot_attention_dot_flex_attention']._validate_device, accessed_by=GetAttrGuardAccessor(_validate_device) + | | | | +- GuardManager: source=G['__import_torch_dot_nn_dot_attention_dot_flex_attention']._validate_device.__code__, accessed_by=GetAttrGuardAccessor(__code__) + | | | | | +- ID_MATCH: ___check_obj_id(G['__import_torch_dot_nn_dot_attention_dot_flex_attention']._validate_device.__code__, 140409673611088) + | | | +- GuardManager: source=G['__import_torch_dot_nn_dot_attention_dot_flex_attention'].flex_attention_hop, accessed_by=GetAttrGuardAccessor(flex_attention_hop) + | | | | +- TYPE_MATCH: ___check_type_id(G['__import_torch_dot_nn_dot_attention_dot_flex_attention'].flex_attention_hop, 96992544) + | | | | +- GuardManager: source=G['__import_torch_dot_nn_dot_attention_dot_flex_attention'].flex_attention_hop.__name__, accessed_by=GetAttrGuardAccessor(__name__) + | | | | | +- EQUALS_MATCH: G['__import_torch_dot_nn_dot_attention_dot_flex_attention'].flex_attention_hop.__name__ == 'flex_attention' + | | | +- GuardManager: source=G['__import_torch_dot_nn_dot_attention_dot_flex_attention']._supported_head_dim, accessed_by=GetAttrGuardAccessor(_supported_head_dim) + | | | | +- GuardManager: source=G['__import_torch_dot_nn_dot_attention_dot_flex_attention']._supported_head_dim.__code__, accessed_by=GetAttrGuardAccessor(__code__) + | | | | | +- ID_MATCH: ___check_obj_id(G['__import_torch_dot_nn_dot_attention_dot_flex_attention']._supported_head_dim.__code__, 140409673231376) + | | | +- GuardManager: source=G['__import_torch_dot_nn_dot_attention_dot_flex_attention']._validate_embed_dim, accessed_by=GetAttrGuardAccessor(_validate_embed_dim) + | | | | +- GuardManager: source=G['__import_torch_dot_nn_dot_attention_dot_flex_attention']._validate_embed_dim.__code__, accessed_by=GetAttrGuardAccessor(__code__) + | | | | | +- ID_MATCH: ___check_obj_id(G['__import_torch_dot_nn_dot_attention_dot_flex_attention']._validate_embed_dim.__code__, 388086512) + | | | +- GuardManager: source=G['__import_torch_dot_nn_dot_attention_dot_flex_attention']._validate_sdpa_input, accessed_by=GetAttrGuardAccessor(_validate_sdpa_input) + | | | | +- GuardManager: source=G['__import_torch_dot_nn_dot_attention_dot_flex_attention']._validate_sdpa_input.__code__, accessed_by=GetAttrGuardAccessor(__code__) + | | | | | +- ID_MATCH: ___check_obj_id(G['__import_torch_dot_nn_dot_attention_dot_flex_attention']._validate_sdpa_input.__code__, 387915104) + | | | +- GuardManager: source=G['__import_torch_dot_nn_dot_attention_dot_flex_attention']._apply_kernel_options, accessed_by=GetAttrGuardAccessor(_apply_kernel_options) + | | | | +- GuardManager: source=G['__import_torch_dot_nn_dot_attention_dot_flex_attention']._apply_kernel_options.__code__, accessed_by=GetAttrGuardAccessor(__code__) + | | | | | +- ID_MATCH: ___check_obj_id(G['__import_torch_dot_nn_dot_attention_dot_flex_attention']._apply_kernel_options.__code__, 140409683680752) + +V1003 10:11:10.355000 2235078 torch/_dynamo/utils.py:988] {"chromium_event": {}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0, "has_payload": "68cb1c0ec8488a404219d0a05ec80df9"} + { + "name": "entire_frame_compile", + "ts": 1727975470355549.8, + "args": { + "cache_stats": { + "fxgraph_cache_hit": 1, + "fxgraph_cache_miss": 2, + "fxgraph_cache_bypass": 0 + } + }, + "ph": "E", + "cat": "dynamo_timed", + "tid": 0, + "pid": 0 + } +V1003 10:11:10.355000 2235078 torch/_dynamo/utils.py:988] {"chromium_event": {}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0, "has_payload": "eb02aec5687f6fb5c0049bd4411b1b24"} + { + "name": "_compile.compile_inner", + "ts": 1727975470355876.5, + "args": { + "cache_stats": { + "fxgraph_cache_hit": 1, + "fxgraph_cache_miss": 2, + "fxgraph_cache_bypass": 0 + } + }, + "ph": "E", + "cat": "dynamo_timed", + "tid": 0, + "pid": 0 + } +V1003 10:11:10.356000 2235078 torch/_dynamo/utils.py:840] {"compilation_metrics": {"compile_id": "1/0", "frame_key": "2", "co_name": "fn2", "co_filename": "/data/users/oulgen/pytorch/test/inductor/test_codecache.py", "co_firstlineno": 385, "cache_size": 0, "accumulated_cache_size": 0, "guard_count": 80, "shape_env_guard_count": 0, "graph_op_count": 11, "graph_node_count": 25, "graph_input_count": 11, "start_time": 1727975463.4963815, "entire_frame_compile_time_s": 6.859049081802368, "backend_compile_time_s": 6.648646593093872, "inductor_compile_time_s": 6.517011404037476, "code_gen_time_s": 6.449295282363892, "fail_type": null, "fail_reason": null, "fail_user_frame_filename": null, "fail_user_frame_lineno": null, "non_compliant_ops": [], "compliant_custom_ops": [], "restart_reasons": [], "dynamo_time_before_restart_s": 0.0, "has_guarded_code": true, "possibly_missed_reinplacing_opportunities": 0, "remote_cache_time_saved_s": 0, "structured_logging_overhead_s": 0.085769178, "config_suppress_errors": false, "config_inline_inbuilt_nn_modules": true, "specialize_float": true}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0} diff --git a/tests/integration_test.rs b/tests/integration_test.rs index 01c754f..345226b 100644 --- a/tests/integration_test.rs +++ b/tests/integration_test.rs @@ -152,3 +152,29 @@ fn test_parse_chromium_event() { ); } } + +#[test] +fn test_cache_hit_miss() { + let expected_files = [ + "1_0_0/fx_graph_cache_miss_8", + "1_0_0/fx_graph_cache_hit_17", + "index.html", + ]; + // Generated via TORCH_TRACE=~/trace_logs/test python test/inductor/test_codecache.py -k test_flex_attention_caching + let path = Path::new("tests/inputs/cache_hit_miss.log").to_path_buf(); + let config = tlparse::ParseConfig { + strict: true, + ..Default::default() + }; + let output = tlparse::parse_path(&path, config); + assert!(output.is_ok()); + let map: HashMap = output.unwrap().into_iter().collect(); + // Check all files are present + for prefix in expected_files { + assert!( + prefix_exists(&map, prefix), + "{} not found in output", + prefix + ); + } +}