From 4645de320e4d30b4e4ebf06ac721755890a19fab Mon Sep 17 00:00:00 2001
From: Xu Xing <xing.xu@intel.com>
Date: Thu, 12 Oct 2023 13:34:26 +0800
Subject: [PATCH] [webgpu] dump test

---
 cmake/onnxruntime_webassembly.cmake           |  1 +
 js/web/lib/index.ts                           |  2 +
 js/web/lib/onnxjs/graph.ts                    |  6 ++
 js/web/package.json                           |  2 +-
 .../debug_node_inputs_outputs_utils.cc        | 97 +++++++++++++++++--
 .../debug_node_inputs_outputs_utils.h         |  2 +
 .../framework/print_tensor_statistics_utils.h |  2 +-
 .../core/framework/sequential_executor.cc     | 21 +++-
 .../core/optimizer/graph_transformer_utils.cc |  2 +-
 9 files changed, 123 insertions(+), 12 deletions(-)

diff --git a/cmake/onnxruntime_webassembly.cmake b/cmake/onnxruntime_webassembly.cmake
index c6510c97a617e..dc8c803bd315c 100644
--- a/cmake/onnxruntime_webassembly.cmake
+++ b/cmake/onnxruntime_webassembly.cmake
@@ -265,6 +265,7 @@ else()
   if (onnxruntime_USE_WEBNN)
    set_property(TARGET onnxruntime_webassembly APPEND_STRING PROPERTY LINK_FLAGS " --bind -sWASM_BIGINT")
   endif()
+  set_property(TARGET onnxruntime_webassembly APPEND_STRING PROPERTY LINK_FLAGS " --bind -sWASM_BIGINT")
 
   # Set link flag to enable exceptions support, this will override default disabling exception throwing behavior when disable exceptions.
   target_link_options(onnxruntime_webassembly PRIVATE "SHELL:-s DISABLE_EXCEPTION_THROWING=0")
diff --git a/js/web/lib/index.ts b/js/web/lib/index.ts
index c5c27a4318049..c3b1c82a8788d 100644
--- a/js/web/lib/index.ts
+++ b/js/web/lib/index.ts
@@ -7,6 +7,8 @@
 // So we import code inside the if-clause to allow bundler remove the code safely.
 
 export * from 'onnxruntime-common';
+export * from './onnxjs/model';
+export * as onnxProto from './onnxjs/ort-schema/protobuf/onnx';
 import {registerBackend, env} from 'onnxruntime-common';
 import {version} from './version';
 
diff --git a/js/web/lib/onnxjs/graph.ts b/js/web/lib/onnxjs/graph.ts
index f16da42815957..3f71e8628da4b 100644
--- a/js/web/lib/onnxjs/graph.ts
+++ b/js/web/lib/onnxjs/graph.ts
@@ -118,15 +118,19 @@ class Node implements Graph.Node {
       this.attributes = new Attribute(ProtoUtil.tensorAttributesFromORTFormat(_nodeProto));
     }
 
+    this.inputNames = [];
     this.inputs = [];
     this.outputs = [];
+    this.outputNames = [];
     this.executeNode = true;
   }
 
   name: string;
   opType: string;
   inputs: number[];
+  inputNames: string[];
   outputs: number[];
+  outputNames: string[];
   attributes: Attribute;
   executeNode: boolean;
 }
@@ -297,6 +301,7 @@ class GraphImpl implements Graph, Graph.Transformer {
           dataIndices.set(output, dataIndex);
         }
         node.outputs.push(dataIndex);
+        node.outputNames.push(output);
 
         if (this._allData[dataIndex]._from !== undefined) {
           throw new Error(`multiple nodes output to one data value: ${dataIndex}`);
@@ -340,6 +345,7 @@ class GraphImpl implements Graph, Graph.Transformer {
           throw new Error(`unrecognized input '${input}' for node: ${nodeProto.name}`);
         }
         node.inputs.push(dataIndex);
+        node.inputNames.push(input);
 
         this._allData[dataIndex]._to.push(i);
       }
diff --git a/js/web/package.json b/js/web/package.json
index 15f13600c050e..ff72409861ec4 100644
--- a/js/web/package.json
+++ b/js/web/package.json
@@ -45,7 +45,7 @@
     "@webgpu/types": "^0.1.30",
     "base64-js": "^1.5.1",
     "chai": "^4.3.7",
-    "electron": "^23.1.2",
+    "electron": "^23.3.13",
     "globby": "^13.1.3",
     "karma": "^6.4.1",
     "karma-browserstack-launcher": "^1.6.0",
diff --git a/onnxruntime/core/framework/debug_node_inputs_outputs_utils.cc b/onnxruntime/core/framework/debug_node_inputs_outputs_utils.cc
index ec50bb7d6a5cb..2b96a445fe240 100644
--- a/onnxruntime/core/framework/debug_node_inputs_outputs_utils.cc
+++ b/onnxruntime/core/framework/debug_node_inputs_outputs_utils.cc
@@ -2,7 +2,9 @@
 // Licensed under the MIT License.
 
 #ifdef DEBUG_NODE_INPUTS_OUTPUTS
-
+//#include <iostream>
+//#include <string>
+#include <emscripten/emscripten.h>
 #include "core/framework/debug_node_inputs_outputs_utils.h"
 #include "core/framework/print_tensor_utils.h"
 #include "core/framework/print_tensor_statistics_utils.h"
@@ -59,7 +61,73 @@ bool FilterNode(const NodeDumpOptions& dump_options, const Node& node) {
 }
 
 template <typename T>
-void DumpTensorToStdOut(const Tensor& tensor, const NodeDumpOptions& dump_options) {
+void DumpTensorToStdOut(const Tensor& tensor, const std::string tensor_name, const NodeDumpOptions& dump_options) {
+  auto data = tensor.Data<T>();
+  const auto& shape = tensor.Shape();
+  auto num_items = shape.Size();
+  auto numDimensions = shape.NumDimensions();
+  int64_t shape_array[numDimensions];
+  for (size_t i =0 ; i < numDimensions; i ++) {
+    shape_array[i] = shape[i];
+  }
+  auto tensor_type = DataTypeImpl::ToString(tensor.DataType());
+  std::cout<<"tensor data type: "<<DataTypeImpl::ToString(tensor.DataType())<<"\n";
+
+  EM_ASM(
+      {
+        DataView.prototype.getUint64 = function(byteOffset, littleEndian) {
+          // Read the 64-bit value as two 32-bit halves and recombine them into a JS Number.
+          const left =  this.getUint32(byteOffset, littleEndian);
+          const right = this.getUint32(byteOffset+4, littleEndian);
+          const combined = littleEndian? left + 2**32*right : 2**32*left + right;  // low word is read first when little-endian
+
+          if (!Number.isSafeInteger(combined))  // warn only; the possibly imprecise value is still returned
+            console.warn(combined, 'exceeds MAX_SAFE_INTEGER. Precision may be lost');
+          return combined;
+        };
+
+        BigInt.prototype.toJSON = function () {  // lets JSON.stringify emit the BigInt64Array dims as plain numbers
+          return Number(this);  // convert directly; going through toString(16) would parse hex digits as decimal
+        };
+        // Serializes json_object and triggers a browser download named "<name>.json".
+        function SaveObjectsToFile(json_object, name) {
+          const file = new Blob([JSON.stringify(json_object)], {type: 'application/json'});
+          const url = URL.createObjectURL(file);
+          const a = document.createElement('a');
+          a.href = url;
+          a.download = `${name}.json`;
+          a.click();
+          // Release the blob URL once the download has had time to start; without
+          // this every dumped tensor stays pinned in memory for the page lifetime.
+          setTimeout(() => URL.revokeObjectURL(url), 40000);
+        }
+
+        const name = UTF8ToString($0);
+
+        const buffer = $1;
+        const buffer_size = $2;
+        console.log(buffer_size);
+        const bytes = new Uint8Array(buffer_size);
+        bytes.set(HEAPU8.subarray(buffer, buffer + buffer_size));
+
+        const tensor_type = UTF8ToString($3);;
+        const shape_ptr = $4;
+        const shape_size = $5 * 8;
+        console.log(shape_size);
+        const shape_bytes = new Uint8Array(shape_size);
+        shape_bytes.set(HEAPU8.subarray(shape_ptr, shape_ptr + shape_size));
+
+        const shape_int64 = new BigInt64Array(shape_bytes.buffer);
+        SaveObjectsToFile({'data': Array.from(new Float32Array(bytes.buffer)),
+                           'dims':Array.from(shape_int64),  'type': tensor_type}, name);
+      },
+      reinterpret_cast<int32_t>(tensor_name.c_str()),
+      reinterpret_cast<int32_t>(data),
+      static_cast<int32_t>(num_items*4),
+      reinterpret_cast<int32_t>(tensor_type),
+      shape_array,
+      numDimensions);
+
   onnxruntime::utils::PrintCpuTensor<T>(tensor, dump_options.snippet_threshold, dump_options.snippet_edge_items);
   if (dump_options.dump_flags & NodeDumpOptions::DumpFlags::StatisticsData) {
     onnxruntime::utils::PrintCpuTensorStats<T>(tensor);
@@ -298,11 +366,12 @@ void DumpCpuTensor(
     const Tensor& tensor, const TensorMetadata& tensor_metadata) {
   switch (dump_options.data_destination) {
     case NodeDumpOptions::DataDestination::StdOut: {
-      DispatchOnTensorType(tensor.DataType(), DumpTensorToStdOut, tensor, dump_options);
+      DispatchOnTensorType(tensor.DataType(), DumpTensorToStdOut, tensor, tensor_metadata.name, dump_options);
       break;
     }
     case NodeDumpOptions::DataDestination::TensorProtoFiles: {
       const Path tensor_file = dump_options.output_dir / Path::Parse(MakeTensorFileName(tensor_metadata.name, dump_options));
+      std::cout<<" tensor_file =" <<tensor_file.ToPathString() <<", tensor_metadata.name="<<tensor_metadata.name<<"\n";
       DumpTensorToFile(tensor, tensor_metadata.name, tensor_file);
       break;
     }
@@ -325,6 +394,7 @@ void DumpTensor(
     const SessionState& session_state) {
   // check tensor is on CPU before dumping it
   auto& tensor_location = tensor.Location();
+  std::cout<< __FUNCTION__<<":"<<__LINE__<<"\n";
   if (tensor_location.device.Type() == OrtDevice::CPU ||
       tensor_location.mem_type == OrtMemTypeCPUInput ||
       tensor_location.mem_type == OrtMemTypeCPUOutput) {
@@ -447,6 +517,17 @@ static void PrintIf(bool boolean_expression, const std::string& message) {
   }
 }
 
+// Debug helper: dumps `tensor` via DumpTensor with the input- and output-data flags set, labelled "<name>_Dump".
+void DumpCpuTensorFromFrame(const Tensor& tensor, const SessionState& session_state, const std::string& name) {
+  TensorMetadata frame_tensor_info;
+  frame_tensor_info.name = name + "_Dump";
+  frame_tensor_info.step = 1;
+  frame_tensor_info.consumer = "unknowConsumer";
+  utils::NodeDumpOptions frame_dump_options{};
+  frame_dump_options.dump_flags |= utils::NodeDumpOptions::DumpFlags::InputData | utils::NodeDumpOptions::DumpFlags::OutputData;
+  DumpTensor(frame_dump_options, tensor, frame_tensor_info, session_state);
+}
+
 void DumpNodeInputs(
     const NodeDumpOptions& dump_options,
     const NodeDumpContext& dump_context,
@@ -491,12 +572,12 @@ void DumpNodeInputs(
             const bool is_shape_set = (dump_options.dump_flags & NodeDumpOptions::DumpFlags::Shape) != 0;
             PrintIf(is_shape_set, MakeString(" Shape: ", shape, "\n"));
 
-            if ((dump_options.dump_flags & NodeDumpOptions::DumpFlags::InputData) != 0) {
+            //if ((dump_options.dump_flags & NodeDumpOptions::DumpFlags::InputData) != 0) {
               tensor_metadata.name = input_defs[i]->Name();
               tensor_metadata.step = dump_context.iteration;
               tensor_metadata.consumer = node.Name() + ":" + std::to_string(i);
               DumpTensor(dump_options, *tensor, tensor_metadata, session_state);
-            }
+            //}
           } else {
             std::cout << " is empty optional tensor.\n";
           }
@@ -562,12 +643,14 @@ void DumpNodeOutputs(
             const bool is_shape_set = (dump_options.dump_flags & NodeDumpOptions::DumpFlags::Shape) != 0;
             PrintIf(is_shape_set, MakeString(" Shape: ", shape, "\n"));
 
-            if ((dump_options.dump_flags & NodeDumpOptions::DumpFlags::OutputData) != 0) {
+            //if ((dump_options.dump_flags & NodeDumpOptions::DumpFlags::OutputData) != 0) {
               tensor_metadata.name = output_defs[i]->Name();
               tensor_metadata.step = dump_context.iteration;
               tensor_metadata.producer = node.Name() + ":" + std::to_string(i);
+              std::cout<< __FUNCTION__<<":"<<__LINE__<<"\n";
               DumpTensor(dump_options, *tensor, tensor_metadata, session_state);
-            }
+              std::cout<< __FUNCTION__<<":"<<__LINE__<<"\n";
+            //}
           } else {
             std::cout << " is empty optional tensor.\n";
           }
diff --git a/onnxruntime/core/framework/debug_node_inputs_outputs_utils.h b/onnxruntime/core/framework/debug_node_inputs_outputs_utils.h
index bde005fc204c8..f5b0a44ffe118 100644
--- a/onnxruntime/core/framework/debug_node_inputs_outputs_utils.h
+++ b/onnxruntime/core/framework/debug_node_inputs_outputs_utils.h
@@ -158,6 +158,8 @@ void DumpNodeOutputs(
     const Node& node,
     const SessionState& session_state);
 
+void DumpCpuTensorFromFrame(const Tensor& tensor, const SessionState& session_state, const std::string& name);
+
 }  // namespace utils
 }  // namespace onnxruntime
 
diff --git a/onnxruntime/core/framework/print_tensor_statistics_utils.h b/onnxruntime/core/framework/print_tensor_statistics_utils.h
index fd036114f3e76..40341c5547dd2 100644
--- a/onnxruntime/core/framework/print_tensor_statistics_utils.h
+++ b/onnxruntime/core/framework/print_tensor_statistics_utils.h
@@ -139,7 +139,7 @@ void PrintCpuTensorStats(const Tensor& tensor) {
   }
 
   const T* data = tensor.Data<T>();
-  PrintTensorStats<T>(data, num_items);
+  PrintTensorStats<T>(data, (size_t)num_items);
   std::cout << std::endl;
 }
 
diff --git a/onnxruntime/core/framework/sequential_executor.cc b/onnxruntime/core/framework/sequential_executor.cc
index ba68bc1d7d834..e47397567fc4e 100644
--- a/onnxruntime/core/framework/sequential_executor.cc
+++ b/onnxruntime/core/framework/sequential_executor.cc
@@ -327,7 +327,7 @@ class KernelScope {
 #endif
 
 #ifdef DEBUG_NODE_INPUTS_OUTPUTS
-    utils::DumpNodeInputs(dump_context_, kernel_context_, kernel_.Node(), session_state_);
+    // utils::DumpNodeInputs(dump_context_, kernel_context_, kernel_.Node(), session_state_);
 #endif
 
 #ifdef ENABLE_NVTX_PROFILE
@@ -401,6 +401,7 @@ class KernelScope {
 #endif
 
 #ifdef DEBUG_NODE_INPUTS_OUTPUTS
+    utils::DumpNodeInputs(dump_context_, kernel_context_, kernel_.Node(), session_state_);
     utils::DumpNodeOutputs(dump_context_, kernel_context_, kernel_.Node(), session_state_);
 #endif
   }  //~KernelScope
@@ -607,7 +608,23 @@ onnxruntime::Status ExecuteThePlan(const SessionState& session_state, gsl::span<
       ORT_RETURN_IF_ERROR(session_state.UpdateMemoryPatternGroupCache(feeds, std::move(mem_patterns)));
     }
   }
-
+  {
+    /*
+    // auto frame = ctx.GetExecutionFrame();
+    //auto ort_value_idx_map = session_state.GetOrtValueNameIdxMap()
+    auto num_tensor = static_cast<size_t>(session_state.GetOrtValueNameIdxMap().MaxIdx()) + 1;
+
+    std::cout<<"ort_value_idx_map:   "<<num_tensor<< "\n";
+    for (size_t i =0 ; i < num_tensor; i ++) {
+        std::string name;// = '';
+        auto status = session_state.GetOrtValueNameIdxMap().GetName(i, name);
+        std::cout<<status<<", name: "<<name<< ", "<< i <<"\n";
+
+        OrtValue* p_ml_value = ctx.GetExecutionFrame().GetMutableNodeInputOrOutputMLValue(i);
+        Tensor* tensor = p_ml_value ? p_ml_value->GetMutable<Tensor>() : nullptr;
+        utils::DumpCpuTensorFromFrame(*tensor, session_state, name);
+    }*/
+  }
   return Status::OK();
 }
 
diff --git a/orttraining/orttraining/core/optimizer/graph_transformer_utils.cc b/orttraining/orttraining/core/optimizer/graph_transformer_utils.cc
index e5c65b2a96d8c..3a461e2214317 100644
--- a/orttraining/orttraining/core/optimizer/graph_transformer_utils.cc
+++ b/orttraining/orttraining/core/optimizer/graph_transformer_utils.cc
@@ -116,7 +116,7 @@ std::vector<std::unique_ptr<GraphTransformer>> GeneratePreTrainingTransformers(
       // Put ConstantSharing before CommonSubexpressionElimination by intention as it can create more opportunities for
       // CSE. For example, if A and B nodes both do Add operation with a same value but different initializers, by
       // default, CSE will not merge them, because the different initializers are represented by different NodeArg.
-      transformers.emplace_back(std::make_unique<ConstantSharing>(compatible_eps));
+      // transformers.emplace_back(std::make_unique<ConstantSharing>(compatible_eps));
       // LayerNormFusion must be applied before CommonSubexpressionElimination as the latter will break the pattern when 2 LayerNormFusion share the same input.
       transformers.emplace_back(std::make_unique<LayerNormFusion>(compatible_eps));
       // Remove duplicate nodes. Must be applied before any recompute transformations.