From 5cbb81efb0f17e993f5e653d17be1609a0533abc Mon Sep 17 00:00:00 2001
From: Xu Xing <xing.xu@intel.com>
Date: Thu, 12 Oct 2023 13:34:26 +0800
Subject: [PATCH] [webgpu] Dump node tensors and the optimized model as blob URLs for debugging

---
 cmake/onnxruntime_webassembly.cmake           |   1 +
 js/web/lib/index.ts                           |   3 +
 js/web/lib/onnxjs/graph.ts                    |   6 +
 js/web/package.json                           |   2 +-
 .../debug_node_inputs_outputs_utils.cc        | 107 ++++++++++++++++--
 .../debug_node_inputs_outputs_utils.h         |   2 +
 .../framework/print_tensor_statistics_utils.h |   2 +-
 .../core/framework/sequential_executor.cc     |   1 -
 onnxruntime/core/graph/model.cc               |   9 +-
 .../core/optimizer/graph_transformer_utils.cc |   2 +
 .../core/optimizer/graph_transformer_utils.cc |   2 +-
 11 files changed, 121 insertions(+), 16 deletions(-)

diff --git a/cmake/onnxruntime_webassembly.cmake b/cmake/onnxruntime_webassembly.cmake
index c6510c97a617e..e64878feef558 100644
--- a/cmake/onnxruntime_webassembly.cmake
+++ b/cmake/onnxruntime_webassembly.cmake
@@ -265,6 +265,7 @@ else()
   if (onnxruntime_USE_WEBNN)
    set_property(TARGET onnxruntime_webassembly APPEND_STRING PROPERTY LINK_FLAGS " --bind -sWASM_BIGINT")
   endif()
+  # set_property(TARGET onnxruntime_webassembly APPEND_STRING PROPERTY LINK_FLAGS " --bind -sWASM_BIGINT")
 
   # Set link flag to enable exceptions support, this will override default disabling exception throwing behavior when disable exceptions.
   target_link_options(onnxruntime_webassembly PRIVATE "SHELL:-s DISABLE_EXCEPTION_THROWING=0")
diff --git a/js/web/lib/index.ts b/js/web/lib/index.ts
index 6060271ced156..44e42c854a71c 100644
--- a/js/web/lib/index.ts
+++ b/js/web/lib/index.ts
@@ -10,6 +10,9 @@ export * from 'onnxruntime-common';
 import * as ort from 'onnxruntime-common';
 export default ort;
 
+export * from './onnxjs/model';
+export * as JsTensor from './onnxjs/tensor';
+export * as OnnxProto from './onnxjs/ort-schema/protobuf/onnx';
 import {registerBackend, env} from 'onnxruntime-common';
 import {version} from './version';
 
diff --git a/js/web/lib/onnxjs/graph.ts b/js/web/lib/onnxjs/graph.ts
index f16da42815957..3f71e8628da4b 100644
--- a/js/web/lib/onnxjs/graph.ts
+++ b/js/web/lib/onnxjs/graph.ts
@@ -118,15 +118,19 @@ class Node implements Graph.Node {
       this.attributes = new Attribute(ProtoUtil.tensorAttributesFromORTFormat(_nodeProto));
     }
 
+    this.inputNames = [];
     this.inputs = [];
     this.outputs = [];
+    this.outputNames = [];
     this.executeNode = true;
   }
 
   name: string;
   opType: string;
   inputs: number[];
+  inputNames: string[];
   outputs: number[];
+  outputNames: string[];
   attributes: Attribute;
   executeNode: boolean;
 }
@@ -297,6 +301,7 @@ class GraphImpl implements Graph, Graph.Transformer {
           dataIndices.set(output, dataIndex);
         }
         node.outputs.push(dataIndex);
+        node.outputNames.push(output);
 
         if (this._allData[dataIndex]._from !== undefined) {
           throw new Error(`multiple nodes output to one data value: ${dataIndex}`);
@@ -340,6 +345,7 @@ class GraphImpl implements Graph, Graph.Transformer {
           throw new Error(`unrecognized input '${input}' for node: ${nodeProto.name}`);
         }
         node.inputs.push(dataIndex);
+        node.inputNames.push(input);
 
         this._allData[dataIndex]._to.push(i);
       }
diff --git a/js/web/package.json b/js/web/package.json
index 7271fed99d709..66e3b093c4bb3 100644
--- a/js/web/package.json
+++ b/js/web/package.json
@@ -45,7 +45,7 @@
     "@webgpu/types": "^0.1.30",
     "base64-js": "^1.5.1",
     "chai": "^4.3.7",
-    "electron": "^23.1.2",
+    "electron": "^23.3.13",
     "globby": "^13.1.3",
     "karma": "^6.4.1",
     "karma-browserstack-launcher": "^1.6.0",
diff --git a/onnxruntime/core/framework/debug_node_inputs_outputs_utils.cc b/onnxruntime/core/framework/debug_node_inputs_outputs_utils.cc
index ec50bb7d6a5cb..14ac25a8f54a5 100644
--- a/onnxruntime/core/framework/debug_node_inputs_outputs_utils.cc
+++ b/onnxruntime/core/framework/debug_node_inputs_outputs_utils.cc
@@ -2,7 +2,9 @@
 // Licensed under the MIT License.
 
 #ifdef DEBUG_NODE_INPUTS_OUTPUTS
-
+//#include <iostream>
+//#include <string>
+#include <emscripten/emscripten.h>
 #include "core/framework/debug_node_inputs_outputs_utils.h"
 #include "core/framework/print_tensor_utils.h"
 #include "core/framework/print_tensor_statistics_utils.h"
@@ -59,8 +61,83 @@ bool FilterNode(const NodeDumpOptions& dump_options, const Node& node) {
 }
 
 template <typename T>
-void DumpTensorToStdOut(const Tensor& tensor, const NodeDumpOptions& dump_options) {
-  onnxruntime::utils::PrintCpuTensor<T>(tensor, dump_options.snippet_threshold, dump_options.snippet_edge_items);
+void DumpTensorToStdOut(const Tensor& tensor, const std::string tensor_name, const NodeDumpOptions& dump_options) {
+  auto data = tensor.Data<T>();
+  const auto& shape = tensor.Shape();
+  auto num_items = shape.Size();
+  auto numDimensions = shape.NumDimensions();
+  int64_t shape_array[numDimensions];
+  for (size_t i =0 ; i < numDimensions; i ++) {
+    shape_array[i] = shape[i];
+  }
+  auto tensor_type = DataTypeImpl::ToString(tensor.DataType());
+
+  EM_ASM(
+      {
+        if (window.dump != 1) {
+          return;
+        }
+
+        DataView.prototype.getUint64 = function(byteOffset, littleEndian) {
+          // Assemble the value from two consecutive 32-bit reads; endianness decides which one is the high half.
+          const firstHalf = this.getUint32(byteOffset, littleEndian);
+          const secondHalf = this.getUint32(byteOffset + 4, littleEndian);
+          const highHalf = littleEndian ? secondHalf : firstHalf;
+          const lowHalf = littleEndian ? firstHalf : secondHalf;
+          const joined = 2 ** 32 * highHalf + lowHalf;
+          if (!Number.isSafeInteger(joined)) console.warn(joined, 'exceeds MAX_SAFE_INTEGER. Precision may be lost');
+          return joined;
+        };
+
+        BigInt.prototype.toJSON = function () {
+          return Number(String(this));  // JSON.stringify calls toJSON, so BigInt values are emitted as numbers
+        };
+
+        function SaveObjectToFile(object, name) {
+          // Serialize object to JSON, wrap it in a Blob, and record its object URL in window.dumpBlobUrlMap.
+          if (window.dumpBlobUrlMap == null) {
+            window.dumpBlobUrlMap = new Map();
+          }
+          const json = JSON.stringify(object);
+          const blob = new Blob([json], {type: 'application/json'});
+          console.log(name);
+          // Entries are keyed by name, so dumping the same name again replaces the stored URL.
+          window.dumpBlobUrlMap.set(name, URL.createObjectURL(blob));
+        }
+
+        const name = UTF8ToString($0);
+        const buffer = $1;
+        const tensor_type = UTF8ToString($3);
+        let data_buffer;
+        if (tensor_type === 'int64') {
+          const buffer_size = $2*8;
+          const bytes = new Uint8Array(buffer_size);
+          bytes.set(HEAPU8.subarray(buffer, buffer + buffer_size));
+          data_buffer = new BigInt64Array(bytes.buffer);
+        } else {
+          const buffer_size = $2*4;
+          const bytes = new Uint8Array(buffer_size);
+          bytes.set(HEAPU8.subarray(buffer, buffer + buffer_size));
+          data_buffer = new Float32Array(bytes.buffer)
+        }
+
+        const shape_ptr = $4;
+        const shape_size = $5 * 8;
+        const shape_bytes = new Uint8Array(shape_size);
+        shape_bytes.set(HEAPU8.subarray(shape_ptr, shape_ptr + shape_size));
+
+        const shape_int64 = new BigInt64Array(shape_bytes.buffer);
+        SaveObjectToFile({'data': Array.from(data_buffer),
+                           'dims':Array.from(shape_int64),  'type': tensor_type}, name);
+      },
+      reinterpret_cast<int32_t>(tensor_name.c_str()),
+      reinterpret_cast<int32_t>(data),
+      static_cast<int32_t>(num_items),
+      reinterpret_cast<int32_t>(tensor_type),
+      shape_array,
+      numDimensions);
+
+  // onnxruntime::utils::PrintCpuTensor<T>(tensor, dump_options.snippet_threshold, dump_options.snippet_edge_items);
   if (dump_options.dump_flags & NodeDumpOptions::DumpFlags::StatisticsData) {
     onnxruntime::utils::PrintCpuTensorStats<T>(tensor);
   }
@@ -298,11 +375,12 @@ void DumpCpuTensor(
     const Tensor& tensor, const TensorMetadata& tensor_metadata) {
   switch (dump_options.data_destination) {
     case NodeDumpOptions::DataDestination::StdOut: {
-      DispatchOnTensorType(tensor.DataType(), DumpTensorToStdOut, tensor, dump_options);
+      DispatchOnTensorType(tensor.DataType(), DumpTensorToStdOut, tensor, tensor_metadata.name, dump_options);
       break;
     }
     case NodeDumpOptions::DataDestination::TensorProtoFiles: {
       const Path tensor_file = dump_options.output_dir / Path::Parse(MakeTensorFileName(tensor_metadata.name, dump_options));
+      std::cout<<" tensor_file =" <<tensor_file.ToPathString() <<", tensor_metadata.name="<<tensor_metadata.name<<"\n";
       DumpTensorToFile(tensor, tensor_metadata.name, tensor_file);
       break;
     }
@@ -447,6 +525,17 @@ static void PrintIf(bool boolean_expression, const std::string& message) {
   }
 }
 
+void DumpCpuTensorFromFrame(const Tensor& tensor, const SessionState& session_state, const std::string& name) {
+  // Dumps `tensor` under the name "<name>_Dump" using default dump options plus the InputData/OutputData flags.
+  utils::NodeDumpOptions options{};
+  options.dump_flags |= utils::NodeDumpOptions::DumpFlags::InputData | utils::NodeDumpOptions::DumpFlags::OutputData;
+  TensorMetadata metadata;
+  metadata.consumer = "unknowConsumer";  // placeholder: no consumer is supplied to this function
+  metadata.step = 1;
+  metadata.name = name + "_Dump";
+  DumpTensor(options, tensor, metadata, session_state);
+}
+
 void DumpNodeInputs(
     const NodeDumpOptions& dump_options,
     const NodeDumpContext& dump_context,
@@ -480,9 +569,7 @@ void DumpNodeInputs(
   for (auto i = 0, end = context.InputCount(); i < end; ++i) {
     if (input_defs[i]->Exists()) {
       std::cout << "Input " << i << " Name: " << input_defs[i]->Name() << "\n";
-
       const auto* type = context.InputType(i);
-
       if (type) {
         if (type->IsTensorType()) {
           if (const auto* tensor = context.Input<Tensor>(i); tensor != nullptr) {
@@ -491,12 +578,12 @@ void DumpNodeInputs(
             const bool is_shape_set = (dump_options.dump_flags & NodeDumpOptions::DumpFlags::Shape) != 0;
             PrintIf(is_shape_set, MakeString(" Shape: ", shape, "\n"));
 
-            if ((dump_options.dump_flags & NodeDumpOptions::DumpFlags::InputData) != 0) {
+            //if ((dump_options.dump_flags & NodeDumpOptions::DumpFlags::InputData) != 0) {
               tensor_metadata.name = input_defs[i]->Name();
               tensor_metadata.step = dump_context.iteration;
               tensor_metadata.consumer = node.Name() + ":" + std::to_string(i);
               DumpTensor(dump_options, *tensor, tensor_metadata, session_state);
-            }
+            //}
           } else {
             std::cout << " is empty optional tensor.\n";
           }
@@ -562,12 +649,12 @@ void DumpNodeOutputs(
             const bool is_shape_set = (dump_options.dump_flags & NodeDumpOptions::DumpFlags::Shape) != 0;
             PrintIf(is_shape_set, MakeString(" Shape: ", shape, "\n"));
 
-            if ((dump_options.dump_flags & NodeDumpOptions::DumpFlags::OutputData) != 0) {
+            //if ((dump_options.dump_flags & NodeDumpOptions::DumpFlags::OutputData) != 0) {
               tensor_metadata.name = output_defs[i]->Name();
               tensor_metadata.step = dump_context.iteration;
               tensor_metadata.producer = node.Name() + ":" + std::to_string(i);
               DumpTensor(dump_options, *tensor, tensor_metadata, session_state);
-            }
+            //}
           } else {
             std::cout << " is empty optional tensor.\n";
           }
diff --git a/onnxruntime/core/framework/debug_node_inputs_outputs_utils.h b/onnxruntime/core/framework/debug_node_inputs_outputs_utils.h
index bde005fc204c8..f5b0a44ffe118 100644
--- a/onnxruntime/core/framework/debug_node_inputs_outputs_utils.h
+++ b/onnxruntime/core/framework/debug_node_inputs_outputs_utils.h
@@ -158,6 +158,8 @@ void DumpNodeOutputs(
     const Node& node,
     const SessionState& session_state);
 
+void DumpCpuTensorFromFrame(const Tensor& tensor, const SessionState& session_state, const std::string& name);
+
 }  // namespace utils
 }  // namespace onnxruntime
 
diff --git a/onnxruntime/core/framework/print_tensor_statistics_utils.h b/onnxruntime/core/framework/print_tensor_statistics_utils.h
index fd036114f3e76..40341c5547dd2 100644
--- a/onnxruntime/core/framework/print_tensor_statistics_utils.h
+++ b/onnxruntime/core/framework/print_tensor_statistics_utils.h
@@ -139,7 +139,7 @@ void PrintCpuTensorStats(const Tensor& tensor) {
   }
 
   const T* data = tensor.Data<T>();
-  PrintTensorStats<T>(data, num_items);
+  PrintTensorStats<T>(data, (size_t)num_items);
   std::cout << std::endl;
 }
 
diff --git a/onnxruntime/core/framework/sequential_executor.cc b/onnxruntime/core/framework/sequential_executor.cc
index ba68bc1d7d834..a127f000f173b 100644
--- a/onnxruntime/core/framework/sequential_executor.cc
+++ b/onnxruntime/core/framework/sequential_executor.cc
@@ -607,7 +607,6 @@ onnxruntime::Status ExecuteThePlan(const SessionState& session_state, gsl::span<
       ORT_RETURN_IF_ERROR(session_state.UpdateMemoryPatternGroupCache(feeds, std::move(mem_patterns)));
     }
   }
-
   return Status::OK();
 }
 
diff --git a/onnxruntime/core/graph/model.cc b/onnxruntime/core/graph/model.cc
index 076332a65c8f2..be016d596139b 100644
--- a/onnxruntime/core/graph/model.cc
+++ b/onnxruntime/core/graph/model.cc
@@ -541,6 +541,10 @@ static Status SaveModel(Model& model, const T& file_path) {
   model_proto.SerializeToArray(buffer, buffer_size);
 
   EM_ASM(({
+          if (window.dump != 2) {
+            console.log("not dump");
+            return;
+          }
            const buffer = $0;
            const buffer_size = $1;
            const file_path = UTF8ToString($2);
@@ -552,8 +556,9 @@ static Status SaveModel(Model& model, const T& file_path) {
            } else {
              // Browser
              const file = new File([bytes], file_path, {type: "application/octet-stream" });
-             const url = URL.createObjectURL(file);
-             window.open(url, '_blank');
+             // const url = URL.createObjectURL(file);
+             // window.open(url, '_blank');
+             window.optmizedModelBlobUrl = URL.createObjectURL(file);
            }
          }),
          reinterpret_cast<int32_t>(buffer),
diff --git a/onnxruntime/core/optimizer/graph_transformer_utils.cc b/onnxruntime/core/optimizer/graph_transformer_utils.cc
index 5a441b1d1701e..9331283492098 100644
--- a/onnxruntime/core/optimizer/graph_transformer_utils.cc
+++ b/onnxruntime/core/optimizer/graph_transformer_utils.cc
@@ -219,7 +219,9 @@ InlinedVector<std::unique_ptr<GraphTransformer>> GenerateTransformers(
         excluded_initializers.insert(p.first);
       }
       const InlinedHashSet<std::string_view> no_limit_empty_ep_list = {};
+      #ifndef DEBUG_NODE_INPUTS_OUTPUTS
       transformers.emplace_back(std::make_unique<ConstantSharing>(no_limit_empty_ep_list, excluded_initializers));
+      #endif
 
       transformers.emplace_back(std::make_unique<CommonSubexpressionElimination>());
       transformers.emplace_back(std::make_unique<ConstantFolding>(cpu_execution_provider, !disable_quant_qdq));
diff --git a/orttraining/orttraining/core/optimizer/graph_transformer_utils.cc b/orttraining/orttraining/core/optimizer/graph_transformer_utils.cc
index 57d76577f1ba7..51ea0b5cad75b 100644
--- a/orttraining/orttraining/core/optimizer/graph_transformer_utils.cc
+++ b/orttraining/orttraining/core/optimizer/graph_transformer_utils.cc
@@ -117,7 +117,7 @@ std::vector<std::unique_ptr<GraphTransformer>> GeneratePreTrainingTransformers(
       // Put ConstantSharing before CommonSubexpressionElimination by intention as it can create more opportunities for
       // CSE. For example, if A and B nodes both do Add operation with a same value but different initializers, by
       // default, CSE will not merge them, because the different initializers are represented by different NodeArg.
-      transformers.emplace_back(std::make_unique<ConstantSharing>(compatible_eps));
+      // transformers.emplace_back(std::make_unique<ConstantSharing>(compatible_eps));
       // LayerNormFusion must be applied before CommonSubexpressionElimination as the latter will break the pattern when 2 LayerNormFusion share the same input.
       transformers.emplace_back(std::make_unique<LayerNormFusion>(compatible_eps));
       // Remove duplicate nodes. Must be applied before any recompute transformations.