First commit

sdpython · May 7, 2024 · 5cc5ca3 · 5cc5ca3
1 parent e2e4f39
commit 5cc5ca3
Show file tree

Hide file tree

Showing 9 changed files with 545 additions and 17 deletions.
diff --git a/CHANGELOGS.rst b/CHANGELOGS.rst
@@ -4,6 +4,7 @@ Change Logs
 0.3.0
 +++++
 
+* :pr:`180`: adds custom operator MaskedScatterNDOfShape on CUDA
 * :pr:`175`: adds custom operator MulSub and SubMul on CUDA
 * :pr:`173`: adds custom operator AddSharedInput, MulSharedInput on CUDA
 * :pr:`170`: adds custom operator TriMatrix on CUDA

diff --git a/_cmake/targets/ortops_optim_cuda.cmake b/_cmake/targets/ortops_optim_cuda.cmake
@@ -20,6 +20,7 @@ if(CUDA_AVAILABLE)
     ../onnx_extended/ortops/optim/cuda/replace_zero.cu
     ../onnx_extended/ortops/optim/cuda/rotary.cu
     ../onnx_extended/ortops/optim/cuda/scatter_nd_of_shape.cu
+    ../onnx_extended/ortops/optim/cuda/scatter_nd_of_shape_masked.cu
     ../onnx_extended/ortops/optim/cuda/submul.cu
     ../onnx_extended/ortops/optim/cuda/transpose_cast_2d.cu
     ../onnx_extended/ortops/optim/cuda/tri_matrix.cu

diff --git a/_unittests/ut_ortops/test_optim_cuda.py b/_unittests/ut_ortops/test_optim_cuda.py
@@ -120,6 +120,115 @@ def test_scatternd_of_shape_standalone_cuda(self):
         self._scatternd_of_shape_cuda("add", 1, TensorProto.FLOAT)
         self._scatternd_of_shape_cuda("add", 1, TensorProto.FLOAT16)
 
+    def _masked_scatternd_of_shape_cuda(self, reduction, line, itype):
+        import onnxruntime
+        from onnx_extended.ortops.optim.cuda import get_ort_ext_libs
+
+        dtype = np.float32 if itype == TensorProto.FLOAT else np.float16
+
+        model1 = oh.make_model(
+            oh.make_graph(
+                [
+                    oh.make_node("Equal", ["indices", "mone"], ["masked_indices"]),
+                    oh.make_node(
+                        "Where",
+                        ["masked_indices", "zero", "updates"],
+                        ["masked_updates"],
+                    ),
+                    oh.make_node(
+                        "ScatterND",
+                        inputs=["data", "indices", "masked_updates"],
+                        outputs=["y"],
+                        reduction=reduction,
+                    ),
+                ],
+                "nd",
+                [
+                    oh.make_tensor_value_info("data", itype, [None, None]),
+                    oh.make_tensor_value_info(
+                        "indices", TensorProto.INT64, [None, None, 1]
+                    ),
+                    oh.make_tensor_value_info("updates", itype, [None, None, None]),
+                ],
+                [oh.make_tensor_value_info("y", itype, [None, None])],
+                [
+                    onh.from_array(np.array([-1], dtype=np.int64), name="mone"),
+                    onh.from_array(np.array([0], dtype=dtype), name="zero"),
+                ],
+            ),
+            opset_imports=[oh.make_opsetid("", 18)],
+            ir_version=9,
+        )
+
+        model2 = oh.make_model(
+            oh.make_graph(
+                [
+                    oh.make_node(
+                        "MaskedScatterNDOfShape",
+                        inputs=["shape", "indices", "updates"],
+                        outputs=["y"],
+                        reduction=reduction,
+                        maskedValue=-1,
+                        domain="onnx_extended.ortops.optim.cuda",
+                    )
+                ],
+                "nd",
+                [
+                    oh.make_tensor_value_info("shape", TensorProto.INT64, [None]),
+                    oh.make_tensor_value_info(
+                        "indices", TensorProto.INT64, [None, None, 1]
+                    ),
+                    oh.make_tensor_value_info("updates", itype, [None, None, None]),
+                ],
+                [oh.make_tensor_value_info("y", itype, [None, None])],
+            ),
+            opset_imports=[
+                oh.make_opsetid("", 18),
+                oh.make_opsetid("onnx_extended.ortops.optim.cuda", 1),
+            ],
+            ir_version=9,
+        )
+
+        data = np.zeros((32, 16), dtype=dtype)
+        indices = np.array(
+            [
+                [0, 1, 2],
+                [2, 3, 4],
+                [-1, 30, 31],
+                [-1, 7, 8],
+                [10, 11, -1],
+                [20, -1, 21],
+            ],
+            dtype=np.int64,
+        )
+        indices = indices[..., np.newaxis]
+        shape = (6, 3, data.shape[-1])
+        updates = (np.arange(np.prod(shape)).reshape(shape) + 1).astype(dtype)
+
+        feeds1 = dict(data=data, indices=indices, updates=updates)
+        feeds2 = dict(
+            shape=np.array(data.shape, dtype=np.int64), indices=indices, updates=updates
+        )
+        ref = CReferenceEvaluator(model1)
+        expected = ref.run(None, feeds1)[0]
+
+        opts = onnxruntime.SessionOptions()
+        opts.register_custom_ops_library(get_ort_ext_libs()[0])
+        # opts.log_severity_level = 0
+        # opts.log_verbosity_level = 0
+        sess = onnxruntime.InferenceSession(
+            model2.SerializeToString(), opts, providers=["CUDAExecutionProvider"]
+        )
+        got = sess.run(None, feeds2)[0]
+        self.assertEqual(expected.tolist(), got.tolist())
+
+    @unittest.skipIf(not has_cuda(), reason="cuda not available")
+    def test_masked_scatternd_of_shape_standalone_cuda(self):
+        self._masked_scatternd_of_shape_cuda("add", 0, TensorProto.FLOAT)
+        self._masked_scatternd_of_shape_cuda("add", 0, TensorProto.FLOAT16)
+        self._masked_scatternd_of_shape_cuda("add", 1, TensorProto.FLOAT)
+        self._masked_scatternd_of_shape_cuda("add", 1, TensorProto.FLOAT16)
+
     def _addaddmulmul_cuda(self, itype, op_type, broad=False):
         import onnxruntime
         from onnx_extended.ortops.optim.cuda import get_ort_ext_libs

diff --git a/onnx_extended/ortops/optim/cuda/__init__.py b/onnx_extended/ortops/optim/cuda/__init__.py
@@ -133,6 +133,37 @@ def documentation() -> List[str]:
 
     **Constraints**
 
+    * T: float, float16
+    """,
+                """
+    onnx_extended.ortops.optim.cuda.MaskedScatterNDOfShape
+    ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+    ConstantOfShape + Where + ScatterND,
+    updates a null matrix with updates only where the indices are not
+    equal to the masked value (usually -1)
+
+    **Provider**
+    
+    CUDAExecutionProvider
+    
+    **Attributes**
+    
+    * maskedValue (int): updates are ignored when the indices are equal to this value.
+    
+    **Inputs**
+    
+    * shape (I): tensor of type I
+    * indices (I): tensor of type I
+    * updates (T): tensor of type T
+
+    **Outputs**
+
+    * Z (T): updated tensor
+
+    **Constraints**
+
+    * I: int64
     * T: float, float16
     """,
                 """

diff --git a/onnx_extended/ortops/optim/cuda/ort_optim_cuda_lib.cc b/onnx_extended/ortops/optim/cuda/ort_optim_cuda_lib.cc
@@ -16,6 +16,7 @@
 #include "replace_zero.h"
 #include "rotary.h"
 #include "scatter_nd_of_shape.h"
+#include "scatter_nd_of_shape_masked.h"
 #include "submul.h"
 #include "transpose_cast_2d.h"
 #include "tri_matrix.h"
@@ -76,6 +77,9 @@ OrtStatus *ORT_API_CALL RegisterCustomOps(OrtSessionOptions *options,
   static ortops::ScatterNDOfShapeOp<float> c_ScatterNDOfShapeOp32;
   static ortops::ScatterNDOfShapeOp<half> c_ScatterNDOfShapeOp16;
 
+  static ortops::MaskedScatterNDOfShapeOp<float> c_MaskedScatterNDOfShapeOp32;
+  static ortops::MaskedScatterNDOfShapeOp<half> c_MaskedScatterNDOfShapeOp16;
+
   static ortops::Transpose2DCastOp c_Transpose2DCast16(ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT,
                                                        ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT16);
   static ortops::Transpose2DCastOp c_Transpose2DCast32(ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT16,
@@ -128,6 +132,9 @@ OrtStatus *ORT_API_CALL RegisterCustomOps(OrtSessionOptions *options,
     domain.Add(&c_ScatterNDOfShapeOp32);
     domain.Add(&c_ScatterNDOfShapeOp16);
 
+    domain.Add(&c_MaskedScatterNDOfShapeOp32);
+    domain.Add(&c_MaskedScatterNDOfShapeOp16);
+
     domain.Add(&c_Transpose2DCast16);
     domain.Add(&c_Transpose2DCast32);
 

diff --git a/onnx_extended/ortops/optim/cuda/scatter_nd_of_shape.h b/onnx_extended/ortops/optim/cuda/scatter_nd_of_shape.h
@@ -2,27 +2,11 @@
 
 #include "common/common_kernels.h"
 #include "cublas_v2.h"
+#include "scatter_nd_of_shape_common.h"
 #include <cuda_runtime.h>
 
 namespace ortops {
 
-enum class Reduction : int {
-  None = 0,
-  Add = 1,
-  Mul = 2,
-  Min = 3,
-  Max = 4,
-};
-
-enum class Strategy : int {
-  None = 0,
-  Optimize = 1,
-};
-
-struct Shape2 {
-  int64_t dims[12];
-};
-
 template <typename T> struct ScatterNDOfShapeKernel {
   ScatterNDOfShapeKernel(const OrtApi &api, const OrtKernelInfo *info);
   void Compute(OrtKernelContext *context);

diff --git a/onnx_extended/ortops/optim/cuda/scatter_nd_of_shape_common.h b/onnx_extended/ortops/optim/cuda/scatter_nd_of_shape_common.h
@@ -0,0 +1,22 @@
+#pragma once
+
+namespace ortops {
+
+enum class Reduction : int {
+  None = 0,
+  Add = 1,
+  Mul = 2,
+  Min = 3,
+  Max = 4,
+};
+
+enum class Strategy : int {
+  None = 0,
+  Optimize = 1,
+};
+
+struct Shape2 {
+  int64_t dims[12];
+};
+
+} // namespace ortops