Mlas int4 int8 with avx2/512 #20687

Merged
Merged 48 commits on Aug 2, 2024

Commits (48)
293f121
quick adapt llama.cpp to experiment performance. Only works with blkl…
liqunfu May 3, 2024
04c2e56
fire
liqunfu May 6, 2024
cdfda6f
tile 2x4 SQNBITGEMM<4>/BlkLen:32/M:2048/N:4096/K:4096/Threads:1/Symme…
liqunfu May 7, 2024
92dad97
use one_16_epi16 and accumulate_2blk_dot: SQNBITGEMM<4>/BlkLen:32/M:2…
liqunfu May 8, 2024
5418e9c
apply to M1, BQuant layout pack block (subblk) larger than blklen: SQ…
liqunfu May 9, 2024
0401f72
use new AQuant layout (not work if total M is not RangeCountM): SQNBI…
liqunfu May 10, 2024
a57eeba
apply blksum to blklen32 and 64: SQNBITGEMM<4>/BlkLen:32/M:2048/N:409…
liqunfu May 13, 2024
f2c33af
blklen16
liqunfu May 15, 2024
0ca24f4
impl avx512: SQNBITGEMM<4>/BlkLen:32/M:2048/N:4096/K:4096/Threads:1/S…
liqunfu May 26, 2024
7f89d5f
matmul_nbit & fix alignment for sgemm
liqunfu Jun 1, 2024
ed0e666
merge main
liqunfu Jun 4, 2024
35d02a6
fix mlas benchmark not using multi threads
liqunfu Jun 10, 2024
b9493ad
profiling
liqunfu Jun 10, 2024
c443eb5
Merge branch 'liqun/mlas-q4-tile-avx' of https://github.com/microsoft…
liqunfu Jun 10, 2024
ac66951
sgemm after sq4bit for avx2
liqunfu Jun 16, 2024
42a1305
avx512
liqunfu Jun 17, 2024
740031a
layout to follow compute, M1 separate with M > 1
liqunfu Jun 27, 2024
1a6031e
make avx512 run
liqunfu Jun 28, 2024
283fd2d
Merge branch 'main' into liqun/mlas-q4-tile-avx
liqunfu Jun 28, 2024
d035939
avx512 blklen64 pass
liqunfu Jul 4, 2024
f329d2d
pass avx512 blklen32
liqunfu Jul 5, 2024
27cfd9c
pass avx512 blklen 16, 128, 256
liqunfu Jul 5, 2024
edee319
pass fp32, refactor sqnbitgemm
liqunfu Jul 11, 2024
fb9221a
merge main
liqunfu Jul 12, 2024
c109b4b
avx512vnni
liqunfu Jul 18, 2024
6654d22
merge main
liqunfu Jul 18, 2024
4b91bed
avxvnni
liqunfu Jul 20, 2024
8674b9f
rm unused ComputeParallelTasksSGemm
liqunfu Jul 23, 2024
e26e29e
avoid _mm256_dpbusds_avx_epi32 in avx512vnni
liqunfu Jul 24, 2024
2b0307e
fix linux build
liqunfu Jul 24, 2024
40df782
Merge branch 'main' into liqun/mlas-q4-tile-avx
liqunfu Jul 26, 2024
51e97c8
refactor for Arm64
liqunfu Jul 26, 2024
48e8639
more refactor for Arm64
liqunfu Jul 26, 2024
705aa1f
hsum_float_16
liqunfu Jul 29, 2024
012e9c4
hsum_float_16
liqunfu Jul 29, 2024
21b9138
condition for -mavxvnni
liqunfu Jul 30, 2024
1fb1c83
CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 10
liqunfu Jul 30, 2024
85918e9
missed 2 files from (__GNUC__ > 10)
liqunfu Jul 30, 2024
9530ac5
missed _mm256_dpbusds_avx_epi32 and print out cmake msgs
liqunfu Jul 30, 2024
f77cffd
unused zp, etc.
liqunfu Jul 30, 2024
a6fd378
unused zp, etc.
liqunfu Jul 30, 2024
c875e5c
remove test code changes
liqunfu Jul 30, 2024
3b56710
remove test code changes
liqunfu Jul 30, 2024
746562f
lint
liqunfu Jul 30, 2024
52fc7fa
lint
liqunfu Jul 30, 2024
0933a6b
code name
liqunfu Jul 30, 2024
2b35c82
update reviewers' comments
liqunfu Jul 31, 2024
caeb35e
Merge branch 'main' into liqun/mlas-q4-tile-avx
liqunfu Aug 1, 2024
44 changes: 35 additions & 9 deletions onnxruntime/contrib_ops/cpu/quantization/matmul_nbits.cc
@@ -104,6 +104,8 @@

ORT_ENFORCE(nbits_ == 4,
"Only 4b quantization is supported for MatMulNBits op, additional bits support is planned.");
const Tensor* tensor_zero_point = nullptr;
has_zp_input_ = info.TryGetConstantInput(3, &tensor_zero_point);
#ifdef ORT_NEURAL_SPEED
const Tensor* tensor_B = nullptr;
const Tensor* tensor_scale = nullptr;
@@ -139,6 +141,7 @@
IAllocatorUniquePtr<void> packed_b_{};
size_t packed_b_size_{0};

bool has_zp_input_{false};
#if defined(ORT_NEURAL_SPEED)

bool is_asym_{false};
@@ -208,9 +211,8 @@
}

#else // defined(ORT_NEURAL_SPEED)

const auto compute_type = static_cast<MLAS_SQNBIT_GEMM_COMPUTE_TYPE>(accuracy_level_);
if (input_idx == InputIndex::B) {
const auto compute_type = static_cast<MLAS_SQNBIT_GEMM_COMPUTE_TYPE>(accuracy_level_);
if (!MlasIsSQNBitGemmAvailable(nbits_, block_size_, compute_type)) {
return Status::OK();
}
@@ -220,13 +222,24 @@
}
auto qptr = tensor.DataRaw();
packed_b_ = IAllocator::MakeUniquePtr<void>(alloc, packed_b_size_, true);
MlasSQNBitGemmPackQuantBData(N_, K_, nbits_, block_size_, compute_type, qptr, packed_b_.get());
MlasSQNBitGemmPackQuantBData(N_, K_, nbits_, block_size_, compute_type, qptr, packed_b_.get(), nullptr, has_zp_input_, nullptr, nullptr);

if (prepacked_weights) {
// TODO: cannot use packed_b_ after

assert(false);
prepacked_weights->buffers_.push_back(std::move(packed_b_));
prepacked_weights->buffer_sizes_.push_back(packed_b_size_);
}
is_packed = true;
}
else if (input_idx == InputIndex::scales && packed_b_ != nullptr) {

auto sptr = tensor.Data<float>();
MlasSQNBitGemmPackQuantBData(N_, K_, nbits_, block_size_, compute_type, nullptr, packed_b_.get(), sptr, has_zp_input_, nullptr, nullptr);

is_packed = false;
} else if (input_idx == InputIndex::zero_points && packed_b_ != nullptr) {
auto zptr = tensor.Data<uint8_t>();
MlasSQNBitGemmPackQuantBData(N_, K_, nbits_, block_size_, compute_type, nullptr, packed_b_.get(), nullptr, has_zp_input_, zptr, nullptr);

is_packed = false;
}
#endif // defined(ORT_NEURAL_SPEED)

return Status::OK();
@@ -265,6 +278,7 @@
}

Status MatMulNBits::Compute(OpKernelContext* ctx) const {
//auto start = std::chrono::high_resolution_clock::now(); // Start timing here

concurrency::ThreadPool* thread_pool = ctx->GetOperatorThreadPool();
const Tensor* a = ctx->Input<Tensor>(InputIndex::A);
const auto* a_data = a->Data<float>();
@@ -332,9 +346,9 @@
const auto* bias_data = bias == nullptr ? nullptr : bias->Data<float>();

IAllocatorUniquePtr<std::byte> workspace{};
if (const size_t workspace_size = MlasSQNBitGemmBatchWorkspaceSize(M, N, K, batch_count,
nbits_, block_size_, compute_type);
workspace_size > 0) {
const size_t workspace_size = MlasSQNBitGemmBatchWorkspaceSize(
M, N, K, batch_count, nbits_, block_size_, compute_type);
if (workspace_size > 0) {
AllocatorPtr allocator;
ORT_RETURN_IF_ERROR(ctx->GetTempSpaceAllocator(&allocator));
workspace = IAllocator::MakeUniquePtr<std::byte>(allocator, workspace_size);
@@ -344,17 +358,29 @@
for (size_t i = 0; i < batch_count; ++i) {
data[i].A = a_data + helper.LeftOffsets()[i];
data[i].lda = lda;
data[i].QuantBData = packed_b_.get();
data[i].QuantBDataWorkspace = packed_b_.get();
data[i].QuantBScale = scales_data;
data[i].QuantBZeroPoint = zero_points_data;
data[i].Bias = bias_data;
data[i].C = y_data + helper.OutputOffsets()[i];
data[i].ldc = N;
data[i].node_name = this->Node().Name();
}
//auto start2 = std::chrono::high_resolution_clock::now(); // Start timing here

//const int CountTotal = 2000;

//int count = CountTotal;

//while (count-- > 0)
MlasSQNBitGemmBatch(M, N, K, batch_count, nbits_, block_size_, compute_type, data.data(), workspace.get(),
thread_pool);

MlasSQNBitGemmBatch(M, N, K, batch_count, nbits_, block_size_, compute_type, data.data(), workspace.get(),
thread_pool);
//auto end = std::chrono::high_resolution_clock::now(); // End timing here

//std::chrono::duration<double, std::nano> elapsed2 = end - start2;
//// Calculate and print the duration in nanoseconds
//std::chrono::duration<double, std::nano> elapsed = end - start;
//std::cout << "MlasSQNBitGemmBatch: " << elapsed2.count() / CountTotal << " ns\n";
//std::cout << "main Duration_M" << M << "xN" << N << "xK" << K << ": " << elapsed.count() / CountTotal << " ns\n";
return Status::OK();
}
}
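To make the new Compute() path above easier to follow, here is a minimal, hypothetical sketch of how a caller fills the per-batch MLAS_SQNBIT_GEMM_DATA_PARAMS entries (including the new QuantBDataWorkspace field) and invokes MlasSQNBitGemmBatch with the optional workspace. Variable names such as a_data, packed_b, and thread_pool are illustrative only, not part of the MLAS API, and the workspace is held in a plain std::vector rather than an ORT allocator.

```cpp
// Sketch only: assumes "mlas_qnbit.h" is on the include path and that
// packed_b/scales_data/zero_points_data/bias_data were prepared elsewhere.
#include <cstddef>
#include <vector>
#include "mlas_qnbit.h"

void RunSQNBitGemmBatch(size_t M, size_t N, size_t K, size_t batch_count,
                        size_t nbits, size_t block_size,
                        MLAS_SQNBIT_GEMM_COMPUTE_TYPE compute_type,
                        const float* a_data, const void* packed_b,
                        const float* scales_data, const void* zero_points_data,
                        const float* bias_data, float* y_data,
                        MLAS_THREADPOOL* thread_pool) {
  // Per-GEMM parameter blocks; all batches share the same packed B.
  std::vector<MLAS_SQNBIT_GEMM_DATA_PARAMS> data(batch_count);

  // Scratch space is only needed for some compute types / kernels.
  std::vector<std::byte> workspace(
      MlasSQNBitGemmBatchWorkspaceSize(M, N, K, batch_count, nbits, block_size, compute_type));

  for (size_t i = 0; i < batch_count; ++i) {
    data[i].A = a_data + i * M * K;              // i-th A matrix (row-major M x K)
    data[i].lda = K;
    data[i].QuantBDataWorkspace = packed_b;      // output of MlasSQNBitGemmPackQuantBData
    data[i].QuantBScale = scales_data;
    data[i].QuantBZeroPoint = zero_points_data;  // may be nullptr
    data[i].Bias = bias_data;                    // may be nullptr
    data[i].C = y_data + i * M * N;              // i-th C matrix (row-major M x N)
    data[i].ldc = N;
  }

  MlasSQNBitGemmBatch(M, N, K, batch_count, nbits, block_size, compute_type,
                      data.data(), workspace.empty() ? nullptr : workspace.data(),
                      thread_pool);
}
```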
27 changes: 23 additions & 4 deletions onnxruntime/core/mlas/inc/mlas_qnbit.h
@@ -22,6 +22,7 @@ Module Name:

#include "mlas.h"
#include "mlas_gemm_postprocessor.h"
#include <string>

/**
* @brief Define compute types of block quantization, in order of decreasing accuracy.
@@ -45,15 +46,18 @@
struct MLAS_SQNBIT_GEMM_DATA_PARAMS {
const float* A = nullptr; ///< address of A (float32 matrix)
size_t lda = 0; ///< leading dimension of A
const void* QuantBData = nullptr; ///< address of quantized B (quantized n-bit int values)
const void* QuantBDataWorkspace; ///< address of quantized B (quantized n-bit int values)
const std::byte* PackedQuantBData = nullptr;
const float* QuantBScale = nullptr; ///< address of scale values of quantized B, one per block
const void* QuantBZeroPoint = nullptr; ///< optional address of zero point values of quantized B, one per block
const float* QuantBBlkSum = nullptr; ///< optional address of scale * zp, one per block
const float* Bias = nullptr; ///< optional address of Bias, vector size N
float* C = nullptr; ///< address of result matrix
size_t ldc = 0; ///< leading dimension of C

///< optional post processing to apply to result matrix
MLAS_GEMM_POSTPROCESSOR<float>* PostProcessor = nullptr;
std::string node_name = "";
};

/**
@@ -159,14 +163,26 @@
/**
* @brief Packs the quantized B data in a format that the kernel expects.
*
* If the function is called without QuantBScale and QuantBZeroPoint,
* it just packs QuantBData into PackedQuantBDataAndOrBlkSum.
*
* If the function is called with QuantBData, QuantBScale, and QuantBZeroPoint,
* an additional BlkSum (Scale * zero point) is computed and stored in the second part of PackedQuantBDataAndOrBlkSum.
*
* Because ORT OpKernel::PrePack is called for each input (in this case, QuantBData,
* QuantBScale, and QuantBZeroPoint) separately, this function may be called 3 times: first with QuantBData,
* then with QuantBScale, and then with QuantBZeroPoint. In the second call, with QuantBScale,
* BlkSum is computed with the default zero point of 8 and stored in the second part of PackedQuantBDataAndOrBlkSum.
* If there is a third call with QuantBZeroPoint, BlkSum is recomputed/adjusted with the provided zero point.
*
* @param[in] N column size of matrix B and C
* @param[in] K column size of matrix A and row size of matrix B
* @param[in] BlkBitWidth quantized value bit width (e.g., 4 means 4 bit ints)
* @param[in] BlkLen number of quantized values per block
* @param[in] ComputeType GEMM compute type (e.g., multiplying float or int8 values)
* @param[in] QuantBData quantized B data
* @param[out] PackedQuantBData packed quantized B data
* @param[in] ThreadPool optional thread pool to use
* @param[in] ThreadPool thread pool to use (no parallelism if nullptr)
*/
void MLASCALL
MlasSQNBitGemmPackQuantBData(
@@ -176,6 +192,9 @@
size_t BlkLen,
MLAS_SQNBIT_GEMM_COMPUTE_TYPE ComputeType,
const void* QuantBData,
void* PackedQuantBData,
MLAS_THREADPOOL* ThreadPool = nullptr
void* PackedQuantBDataAndOrBlkSum,
const void* QuantBScale,
bool has_zp_input,
const void* QuantBZeroPoint,
MLAS_THREADPOOL* ThreadPool
);
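The call sequence described in the comment above can be sketched as follows. This is a hypothetical PrePack-style driver based on the three MlasSQNBitGemmPackQuantBData calls shown in matmul_nbits.cc; the names quant_b_data, scales, zero_points, and packed_b are illustrative, not actual ORT code.

```cpp
// Sketch only: mirrors the three PrePack calls in matmul_nbits.cc above.
#include <cstddef>
#include <cstdint>
#include "mlas_qnbit.h"

void PackQuantB(size_t N, size_t K, size_t nbits, size_t block_size,
                MLAS_SQNBIT_GEMM_COMPUTE_TYPE compute_type,
                const void* quant_b_data,      // 4-bit packed B weights
                const float* scales,           // one scale per block
                const uint8_t* zero_points,    // optional, one per block
                bool has_zp_input,
                void* packed_b,                // PackedQuantBDataAndOrBlkSum buffer
                MLAS_THREADPOOL* thread_pool) {
  // Call 1: only the quantized data is known; it is packed into packed_b.
  MlasSQNBitGemmPackQuantBData(N, K, nbits, block_size, compute_type,
                               quant_b_data, packed_b,
                               /*QuantBScale=*/nullptr, has_zp_input,
                               /*QuantBZeroPoint=*/nullptr, thread_pool);

  // Call 2: scales arrive; BlkSum is computed assuming the default zero point (8).
  MlasSQNBitGemmPackQuantBData(N, K, nbits, block_size, compute_type,
                               /*QuantBData=*/nullptr, packed_b,
                               scales, has_zp_input,
                               /*QuantBZeroPoint=*/nullptr, thread_pool);

  // Call 3 (only when zero points exist): BlkSum is recomputed with the real zero points.
  if (has_zp_input) {
    MlasSQNBitGemmPackQuantBData(N, K, nbits, block_size, compute_type,
                                 /*QuantBData=*/nullptr, packed_b,
                                 /*QuantBScale=*/nullptr, has_zp_input,
                                 zero_points, thread_pool);
  }
}
```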
2 changes: 1 addition & 1 deletion onnxruntime/core/mlas/lib/platform.cpp
@@ -455,7 +455,7 @@ Return Value:
this->GemvU8S8Kernel = MlasGemvU8S8KernelAvx512Vnni;
this->ConvSymU8S8Dispatch = &MlasConvSymDispatchAvx512Vnni;
this->Q8Q4GemmDispatch = &MlasQ8Q4GemmDispatchAvx512vnni;
this->SQNBitGemmDispatch = &MlasSQNBitGemmDispatchAvx512vnni;
//this->SQNBitGemmDispatch = &MlasSQNBitGemmDispatchAvx512vnni;
}
}
}