diff --git a/.gitignore b/.gitignore index 794fba36..28358319 100644 --- a/.gitignore +++ b/.gitignore @@ -9,6 +9,7 @@ *.sln *.cmake *.whl +*.def /*.png /*.onnx .build_path.txt diff --git a/CHANGELOGS.rst b/CHANGELOGS.rst index a304f4c3..fbec9801 100644 --- a/CHANGELOGS.rst +++ b/CHANGELOGS.rst @@ -4,6 +4,7 @@ Change Logs 0.2.0 +++++ +* :pr:`41`: implements a custom kernel for RandomForestRegressor easier to optimize * :pr:`34`: update to onnxruntime v1.15.1 * :pr:`31`: implement a custom CUDA kernel (gemm) * :pr:`32`: update to onnxruntime v1.15.0 diff --git a/README.rst b/README.rst index f26f13b5..23de400d 100644 --- a/README.rst +++ b/README.rst @@ -29,7 +29,7 @@ onnx-extended: extensions for onnx and onnxruntime **onnx-extended** extends the list of supported operators in onnx reference implementation, or implements faster versions in C++. Documentation `onnx-extended -`_. +`_. Source are available on `github/onnx-extended `_. @@ -115,9 +115,9 @@ can be enabled with the following command: :: - python setup.py build_ext --inplace --enable_nvtx 1 - # or - pip install -e . --config-settings="--enable_nvtx=1" + python setup.py build_ext --inplace --use_nvtx 1 + # or (not working yet) + pip install -e . 
--config-settings="--use_nvtx=1" Experimental cython binding for onnxruntime +++++++++++++++++++++++++++++++++++++++++++ diff --git a/_cmake/CMakeLists.txt b/_cmake/CMakeLists.txt index 6476872a..0f20b76f 100644 --- a/_cmake/CMakeLists.txt +++ b/_cmake/CMakeLists.txt @@ -1,11 +1,12 @@ cmake_minimum_required(VERSION 3.24.0) -project(onnx_extended VERSION 0.2.0) +project(onnx_extended VERSION ${ONNX_EXTENDED_VERSION}) # # initialisation # message(STATUS "-------------------") +message(STATUS "ONNX_EXTENDED_VERSION=${ONNX_EXTENDED_VERSION}") message(STATUS "CMAKE_VERSION=${CMAKE_VERSION}") message(STATUS "CMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}") message(STATUS "CMAKE_C_COMPILER_VERSION=${CMAKE_C_COMPILER_VERSION}") @@ -25,6 +26,7 @@ message(STATUS "USE_CUDA=${USE_CUDA}") message(STATUS "CUDA_BUILD=${CUDA_BUILD}") message(STATUS "USE_NVTX=${USE_NVTX}") message(STATUS "ORT_VERSION=${ORT_VERSION}") + # message(STATUS "ENV-PATH=$ENV{PATH}") # message(STATUS "ENV-PYTHONPATH=$ENV{PYTHONPATH}") message(STATUS "--------------------------------------------") @@ -44,8 +46,8 @@ list(APPEND CMAKE_MODULE_PATH # Packages and constants # -include("load_externals.cmake") include("constants.cmake") +include("load_externals.cmake") # # modules @@ -61,8 +63,10 @@ include("targets/cuda_example_py.cmake") include("targets/vector_function_cy.cmake") set(ORTOPS_INCLUDE_DIR "${ROOT_INCLUDE_PATH}/onnx_extended/ortops") +set(REFOPS_INCLUDE_DIR "${ROOT_INCLUDE_PATH}/onnx_extended/reference/c_ops") include("targets/ortops_tutorial_cpu.cmake") include("targets/ortops_tutorial_cuda.cmake") +include("targets/ortops_optim_cpu.cmake") # # write version diff --git a/_cmake/clang_format.sh b/_cmake/clang_format.sh index be13c1a7..4938cf4a 100644 --- a/_cmake/clang_format.sh +++ b/_cmake/clang_format.sh @@ -6,11 +6,11 @@ echo "--cython-lint--" cython-lint . 
echo "--clang-format--" find onnx_extended -type f \( -name "*.h" -o -name "*.hpp" -o -name "*.cuh" -o -name "*.cpp" -o -name "*.cc" -o -name "*.cu" \) | while read f; do - echo "Processing '$f'"; - clang-format --length 88 -i $f; + echo "clang-format -i $f"; + clang-format -i $f; done echo "--cmake-lint--" find _cmake -type f \( -name "*.cmake" -o -name "*.txt" \) | while read f; do - echo "Processing '$f'"; + echo "cmake-lint $f --line-width=88 --disabled-codes C0103 C0113"; cmake-lint $f --line-width=88 --disabled-codes C0103 C0113; done diff --git a/_cmake/constants.cmake b/_cmake/constants.cmake index 0c784d0a..36775dfc 100644 --- a/_cmake/constants.cmake +++ b/_cmake/constants.cmake @@ -1,3 +1,14 @@ +# +# python extension +# +if(MSVC) + set(DLLEXT "dll") +elseif(APPLE) + set(DLLEXT "dylib") +else() + set(DLLEXT "so") +endif() + # # C++ 14 or C++ 17 # @@ -37,6 +48,9 @@ else() endif() if(APPLE) + message(STATUS "APPLE: set env var for open mp: CC, CCX, LDFLAGS, CPPFLAGS") + set(ENV{CC} "/usr/local/opt/llvm/bin/clang") + set(ENV{CXX} "/usr/local/opt/llvm/bin/clang++") set(ENV(LDFLAGS) "-L/usr/local/opt/llvm/lib") set(ENV(CPPFLAGS) "-I/usr/local/opt/llvm/include") endif() diff --git a/_cmake/externals/FindCudaExtension.cmake b/_cmake/externals/FindCudaExtension.cmake index 86778062..eca09065 100644 --- a/_cmake/externals/FindCudaExtension.cmake +++ b/_cmake/externals/FindCudaExtension.cmake @@ -4,6 +4,14 @@ # Defines USE_NTVX to enable profiling with NVIDIA profiler. # CUDA_VERSION must be defined as well. 
+if(${CMAKE_CUDA_COMPILER} STREQUAL "/usr/bin/nvcc") + message(FATAL_ERROR + "CMAKE_CUDA_COMPILER is equal to '${CMAKE_CUDA_COMPILER}', " + "CUDA_VERSION=${CUDA_VERSION}, " + "CMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES}, " + "You should specify the cuda version by adding --cuda-version=...") +endif() + if(CUDA_VERSION) find_package(CUDAToolkit ${CUDA_VERSION} EXACT) else() @@ -14,6 +22,27 @@ message(STATUS "CUDAToolkit_FOUND=${CUDAToolkit_FOUND}") if(CUDAToolkit_FOUND) + message(STATUS "befor1 language CUDA_VERSION=${CUDA_VERSION}") + message(STATUS "befor1 language CMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES}") + message(STATUS "befor1 language CMAKE_CUDA_COMPILER=${CMAKE_CUDA_COMPILER}") + + if(CMAKE_CUDA_ARCHITECTURES STREQUAL "") + set(CMAKE_CUDA_ARCHITECTURES "native") + endif() + if(CMAKE_CUDA_COMPILER STREQUAL "CMAKE_CUDA_COMPILER-NOTFOUND") + if(CUDA_VERSION STREQUAL "") + message(FATAL_ERROR "No CMAKE_CUDA_COMPILER for CUDA_VERSION=${CUDA_VERSION}. " + "You can use --cuda-version= or set " + "CUDACXX=/usr/local/cuda-/bin/nvcc") + else() + set(CMAKE_CUDA_COMPILER "/usr/local/cuda-${CUDA_VERSION}/bin/nvcc") + message(STATUS "set CMAKE_CUDA_COMPILER=${CMAKE_CUDA_COMPILER}") + endif() + endif() + + message(STATUS "before language CUDA_VERSION=${CUDA_VERSION}") + message(STATUS "before language CMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES}") + message(STATUS "before language CMAKE_CUDA_COMPILER=${CMAKE_CUDA_COMPILER}") enable_language(CUDA) message(STATUS "------------- CUDA settings") message(STATUS "CUDA_VERSION=${CUDA_VERSION}") @@ -30,7 +59,7 @@ if(CUDAToolkit_FOUND) "< ${CUDA_VERSION}, nvcc is not setup properly. 
" "Try 'whereis nvcc' and chack the version.") endif() - + set(CMAKE_CUDA_STANDARD 17) set(CMAKE_CUDA_STANDARD_REQUIRED ON) @@ -42,11 +71,13 @@ if(CUDAToolkit_FOUND) if(CUDA_BUILD STREQUAL "H100opt") - # see https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ + # see https://arnon.dk/ + # matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ set(CMAKE_CUDA_ARCHITECTURES 90) set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_90,code=sm_90") set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_90a,code=sm_90a") - set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_90a,code=compute_90a") + set(CMAKE_CUDA_FLAGS + "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_90a,code=compute_90a") else() # H100, DEFAULT @@ -64,25 +95,36 @@ if(CUDAToolkit_FOUND) endif() if (CMAKE_CUDA_COMPILER_VERSION VERSION_LESS 11) - message(FATAL_ERROR "CUDA verions must be >= 11 but is ${CMAKE_CUDA_COMPILER_VERSION}.") + message(FATAL_ERROR "CUDA verions must be >= 11 but is " + "${CMAKE_CUDA_COMPILER_VERSION}.") endif() if (CMAKE_CUDA_COMPILER_VERSION VERSION_LESS 12) - # 37, 50 still work in CUDA 11 but are marked deprecated and will be removed in future CUDA version. - set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_37,code=sm_37") # K80 - set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_50,code=sm_50") # M series + # 37, 50 still work in CUDA 11 + # but are marked deprecated and will be removed in future CUDA version. 
+ # K80 + set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_37,code=sm_37") + # M series + set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_50,code=sm_50") endif() - set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_52,code=sm_52") # M60 - # set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_60,code=sm_60") # P series - # set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_61,code=sm_61") # P series - set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_70,code=sm_70") # V series - # set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_75,code=sm_75") # T series + # M60 + set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_52,code=sm_52") + # P series + # set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_60,code=sm_60") + # P series + # set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_61,code=sm_61") + # V series + set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_70,code=sm_70") + # T series + # set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_75,code=sm_75") if (CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 11) - set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_80,code=sm_80") # A series - # set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_86,code=sm_86") # A series - # set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_87,code=sm_87") # A series + # A series + set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_80,code=sm_80") + # set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_86,code=sm_86") + # set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_87,code=sm_87") endif() if (CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 11.8) - set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_90,code=sm_90") # H series + # H series + set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_90,code=sm_90") 
endif() endif() diff --git a/_cmake/externals/FindMyPython.cmake b/_cmake/externals/FindMyPython.cmake index d371dfc4..0bc26951 100644 --- a/_cmake/externals/FindMyPython.cmake +++ b/_cmake/externals/FindMyPython.cmake @@ -71,12 +71,6 @@ else() message(STATUS "Use find_package(Python3).") set(Python3_EXECUTABLE ${PYTHON_EXECUTABLE}) if(APPLE) - message(STATUS "APPLE: set env var for open mp: CC, CCX, LDFLAGS, CPPFLAGS") - set(ENV{CC} "/usr/local/opt/llvm/bin/clang") - set(ENV{CXX} "/usr/local/opt/llvm/bin/clang++") - set(ENV{LDFLAGS} "-L/usr/local/opt/llvm/lib") - set(ENV{CPPFLAGS} "-I/usr/local/opt/llvm/include") - find_package(Python3 ${PYTHON_VERSION} COMPONENTS Interpreter Development.Module REQUIRED) diff --git a/_cmake/externals/FindOrt.cmake b/_cmake/externals/FindOrt.cmake index 56c56ed0..733dff0b 100644 --- a/_cmake/externals/FindOrt.cmake +++ b/_cmake/externals/FindOrt.cmake @@ -4,6 +4,8 @@ # downloads onnxruntime as a binary # functions ort_add_dependency, ort_add_custom_op +file(WRITE "../_setup_ext.txt" "") + if(NOT ORT_VERSION) set(ORT_VERSION 1.15.1) set(ORT_VERSION_INT 1150) @@ -58,14 +60,6 @@ else() set(ORT_URL ${ORT_VERSION}) endif() -if(MSVC) - set(DLLEXT "dll") -elseif(APPLE) - set(DLLEXT "dylib") -else() - set(DLLEXT "so") -endif() - find_library(ONNXRUNTIME onnxruntime HINTS "${ONNXRUNTIME_LIB_DIR}") if(ONNXRUNTIME-NOTFOUND) message(FATAL_ERROR "onnxruntime cannot be found at '${ONNXRUNTIME_LIB_DIR}'") @@ -96,26 +90,27 @@ endif() # function(ort_add_dependency name folder_copy) get_target_property(target_output_directory ${name} BINARY_DIR) - message(STATUS "ort copy ${ORT_LIB_FILES_LENGTH} files from '${ONNXRUNTIME_LIB_DIR}'") + message(STATUS "ort: copy-1 ${ORT_LIB_FILES_LENGTH} files from '${ONNXRUNTIME_LIB_DIR}'") if(MSVC) set(destination_dir ${target_output_directory}/${CMAKE_BUILD_TYPE}) else() set(destination_dir ${target_output_directory}) endif() - message(STATUS "ort copy to '${destination_dir}'") + message(STATUS "ort: copy-2 to 
'${destination_dir}'") if(folder_copy) - message(STATUS "ort copy to '${folder_copy}'") + message(STATUS "ort: copy-3 to '${folder_copy}'") endif() foreach(file_i ${ORT_LIB_FILES}) if(NOT EXISTS ${destination_dir}/${file_i}) - message(STATUS "ort copy '${file_i}' to '${destination_dir}'") + message(STATUS "ort: copy-4 '${file_i}' to '${destination_dir}'") add_custom_command( TARGET ${name} POST_BUILD COMMAND ${CMAKE_COMMAND} ARGS -E copy ${file_i} ${destination_dir}) endif() if(folder_copy) if(NOT EXISTS ${folder_copy}/${file_i}) - message(STATUS "ort copy '${file_i}' to '${folder_copy}'") + message(STATUS "ort: copy-5 '${file_i}' to '${folder_copy}'") + # file(APPEND "../_setup_ext.txt" "copy,${file_i},${folder_copy}\n") add_custom_command( TARGET ${name} POST_BUILD COMMAND ${CMAKE_COMMAND} ARGS -E copy ${file_i} ${folder_copy}) @@ -125,8 +120,6 @@ function(ort_add_dependency name folder_copy) # file(COPY ${ORT_LIB_FILES} DESTINATION ${target_output_directory}) endfunction() -file(WRITE "../_setup_ext.txt" "") - # #! 
ort_add_custom_op : compile a pyx file into cpp # @@ -136,8 +129,13 @@ file(WRITE "../_setup_ext.txt" "") # \argn: C++ file to compile # function(ort_add_custom_op name provider folder) + if (WIN32) + file(WRITE "${folder}/${name}.def" "LIBRARY " + "\"${name}.dll\"\nEXPORTS\n RegisterCustomOps @1") + list(APPEND ARGN "${folder}/${name}.def") + endif() if (provider STREQUAL "CUDA") - message(STATUS "ort custom op ${provider}: '${name}': ${ARGN}") + message(STATUS "ort: custom op ${provider}: '${name}': ${ARGN}") add_library(${name} SHARED ${ARGN}) # add property --use_fast_math to cu files @@ -173,7 +171,7 @@ function(ort_add_custom_op name provider folder) PRIVATE ${ONNXRUNTIME_INCLUDE_DIR}) else() - message(STATUS "ort custom op CPU: '${name}': ${ARGN}") + message(STATUS "ort: custom op CPU: '${name}': ${ARGN}") add_library(${name} SHARED ${ARGN}) target_include_directories(${name} PRIVATE ${ONNXRUNTIME_INCLUDE_DIR}) target_compile_definitions(${name} PRIVATE ORT_VERSION=${ORT_VERSION_INT}) diff --git a/_cmake/load_externals.cmake b/_cmake/load_externals.cmake index 71f65954..ec6a3635 100644 --- a/_cmake/load_externals.cmake +++ b/_cmake/load_externals.cmake @@ -158,7 +158,8 @@ message(STATUS "-------------------") if(CUDA_AVAILABLE) set( config_content - "HAS_CUDA = 1\nCUDA_VERSION = '${CUDA_VERSION}'\nCUDA_VERSION_INT = ${CUDA_VERSION_INT}") + "HAS_CUDA = 1\nCUDA_VERSION = '${CUDA_VERSION}'" + "\nCUDA_VERSION_INT = ${CUDA_VERSION_INT}") else() set(config_content "HAS_CUDA = 0") endif() diff --git a/_cmake/targets/c_op_conv_.cmake b/_cmake/targets/c_op_conv_.cmake index ad334831..10c7717c 100644 --- a/_cmake/targets/c_op_conv_.cmake +++ b/_cmake/targets/c_op_conv_.cmake @@ -9,9 +9,21 @@ local_pybind11_add_module( ../onnx_extended/reference/c_ops/cpu/c_op_conv_.cpp) eigen_add_dependency(c_op_conv_) +target_include_directories( + c_op_conv_ + PRIVATE + ${ROOT_INCLUDE_PATH}/onnx_extended) + add_executable(test_c_op_conv_cpp 
../_unittests/ut_reference/test_c_op_conv.cpp ../onnx_extended/reference/c_ops/cpu/c_op_common.cpp) -target_include_directories(test_c_op_conv_cpp PRIVATE ${ROOT_INCLUDE_PATH}) + +target_include_directories( + test_c_op_conv_cpp + PRIVATE + ${ROOT_INCLUDE_PATH} + ${ROOT_INCLUDE_PATH}/onnx_extended) + eigen_add_dependency(test_c_op_conv_cpp) + add_test(NAME test_c_op_conv_cpp COMMAND test_c_op_conv_cpp) diff --git a/_cmake/targets/c_op_tree_ensemble_py_.cmake b/_cmake/targets/c_op_tree_ensemble_py_.cmake index b8d66c6c..3356f9f1 100644 --- a/_cmake/targets/c_op_tree_ensemble_py_.cmake +++ b/_cmake/targets/c_op_tree_ensemble_py_.cmake @@ -8,3 +8,7 @@ local_pybind11_add_module( ../onnx_extended/reference/c_ops/cpu/c_op_common.cpp ../onnx_extended/reference/c_ops/cpu/c_op_tree_ensemble_py_.cpp) +target_include_directories( + c_op_tree_ensemble_py_ + PRIVATE + ${ROOT_INCLUDE_PATH}/onnx_extended) diff --git a/_cmake/targets/ortinf.cmake b/_cmake/targets/ortinf.cmake index e189f7a1..619f136c 100644 --- a/_cmake/targets/ortinf.cmake +++ b/_cmake/targets/ortinf.cmake @@ -4,7 +4,10 @@ message(STATUS "+ CYTHON onnx_extended.ortcy.wrap.ortapi") add_library(lib_ortapi STATIC ../onnx_extended/ortcy/wrap/ortapi.cpp) -target_include_directories(lib_ortapi PUBLIC ${ONNXRUNTIME_INCLUDE_DIR}) +target_include_directories( + lib_ortapi PUBLIC + ${ONNXRUNTIME_INCLUDE_DIR} + ${ROOT_INCLUDE_PATH}/onnx_extended) cython_add_module( ortinf @@ -13,14 +16,17 @@ cython_add_module( target_link_directories(ortinf PRIVATE ${ONNXRUNTIME_LIB_DIR}) message(STATUS " LINK ortinf <- lib_ortapi onnxruntime") target_link_libraries(ortinf PRIVATE lib_ortapi onnxruntime) +target_include_directories(ortinf PRIVATE ${ROOT_INCLUDE_PATH}/onnx_extended) ort_add_dependency(ortinf ${CMAKE_CURRENT_SOURCE_DIR}/../onnx_extended/ortcy/wrap/) set(ORTAPI_INCLUDE_DIR "${ROOT_INCLUDE_PATH}/onnx_extended/ortcy/wrap") add_executable(test_ortcy_inference_cpp ../_unittests/ut_ortcy/test_inference.cpp) 
target_include_directories( - test_ortcy_inference_cpp PRIVATE + test_ortcy_inference_cpp + PRIVATE ${ROOT_INCLUDE_PATH} + ${ROOT_INCLUDE_PATH}/onnx_extended ${ORT_DIR}/include) message(STATUS " LINK test_ortcy_inference_cpp <- lib_ortapi onnxruntime") target_link_directories(test_ortcy_inference_cpp PRIVATE ${ONNXRUNTIME_LIB_DIR}) diff --git a/_cmake/targets/ortops_optim_cpu.cmake b/_cmake/targets/ortops_optim_cpu.cmake new file mode 100644 index 00000000..184b4146 --- /dev/null +++ b/_cmake/targets/ortops_optim_cpu.cmake @@ -0,0 +1,26 @@ +# +# module: onnx_extended.reference.c_ops.cpu.c_op_conv_ +# +message(STATUS "+ KERNEL onnx_extended.ortops.optim.cpu") + +ort_add_custom_op( + ortops_optim_cpu + "CPU" + ../onnx_extended/ortops/optim/cpu + ../onnx_extended/ortops/optim/cpu/tree_ensemble.cc + ../onnx_extended/ortops/optim/cpu/ort_optim_cpu_lib.cc + ../onnx_extended/reference/c_ops/cpu/c_op_common.cpp) + +target_include_directories( + ortops_optim_cpu + PRIVATE + ${ROOT_INCLUDE_PATH}/onnx_extended) + +target_include_directories( + ortops_optim_cpu + PRIVATE + "${ORTAPI_INCLUDE_DIR}" + "${ORTOPS_INCLUDE_DIR}" + "${REFOPS_INCLUDE_DIR}") + +target_link_libraries(ortops_optim_cpu PRIVATE OpenMP::OpenMP_CXX) diff --git a/_cmake/targets/ortops_tutorial_cpu.cmake b/_cmake/targets/ortops_tutorial_cpu.cmake index 36242cf8..57d5a4b8 100644 --- a/_cmake/targets/ortops_tutorial_cpu.cmake +++ b/_cmake/targets/ortops_tutorial_cpu.cmake @@ -10,9 +10,11 @@ ort_add_custom_op( ../onnx_extended/ortops/tutorial/cpu/my_kernel.cc ../onnx_extended/ortops/tutorial/cpu/my_kernel_attr.cc ../onnx_extended/ortops/tutorial/cpu/ort_tutorial_cpu_lib.cc) -# needed to include helpers.h + +# needed to include onnx_extended_helpers.h target_include_directories( ortops_tutorial_cpu PRIVATE + "${ROOT_INCLUDE_PATH}/onnx_extended" "${ORTAPI_INCLUDE_DIR}" "${ORTOPS_INCLUDE_DIR}") diff --git a/_cmake/targets/ortops_tutorial_cuda.cmake b/_cmake/targets/ortops_tutorial_cuda.cmake index 4d45d7d2..8c4f9794 
100644 --- a/_cmake/targets/ortops_tutorial_cuda.cmake +++ b/_cmake/targets/ortops_tutorial_cuda.cmake @@ -12,10 +12,13 @@ if(CUDA_AVAILABLE) ../onnx_extended/ortops/tutorial/cuda ../onnx_extended/ortops/tutorial/cuda/custom_gemm.cu ../onnx_extended/ortops/tutorial/cuda/ort_tutorial_cuda_lib.cc) - # needed to include helpers.h + + # needed to include onnx_extended_helpers.h target_include_directories( ortops_tutorial_cuda PRIVATE + "${ROOT_INCLUDE_PATH}/onnx_extended" "${ORTAPI_INCLUDE_DIR}" "${ORTOPS_INCLUDE_DIR}") + endif() diff --git a/_doc/api/ortops.rst b/_doc/api/ortops.rst index 764710c5..b4ea9007 100644 --- a/_doc/api/ortops.rst +++ b/_doc/api/ortops.rst @@ -3,20 +3,8 @@ ortops ====== -get_ort_ext_libs -================ +.. toctree:: + :maxdepth: -.. autofunction:: onnx_extended.ortops.tutorial.cpu.get_ort_ext_libs - -List of implemented kernels -=========================== - -onnx_extented.ortops.tutorial.cpu -+++++++++++++++++++++++++++++++++ - -.. runpython:: - :showcode: - :rst: - - from onnx_extended.ortops.tutorial.cpu import documentation - print("\n".join(documentation())) + ortops_tutorial + ortops_optim diff --git a/_doc/api/ortops_optim.rst b/_doc/api/ortops_optim.rst new file mode 100644 index 00000000..c321d0c7 --- /dev/null +++ b/_doc/api/ortops_optim.rst @@ -0,0 +1,23 @@ + +============ +ortops.optim +============ + +optimize +======== + +.. autofunction:: onnx_extended.ortops.optim.optimize.change_onnx_operator_domain + +CPU: onnx_extented.ortops.optim.cpu +=================================== + +.. autofunction:: onnx_extended.ortops.optim.cpu.get_ort_ext_libs + +**List of implemented kernels** + +.. 
runpython:: + :showcode: + :rst: + + from onnx_extended.ortops.optim.cpu import documentation + print("\n".join(documentation())) diff --git a/_doc/api/ortops_tutorial.rst b/_doc/api/ortops_tutorial.rst new file mode 100644 index 00000000..6ee12e9d --- /dev/null +++ b/_doc/api/ortops_tutorial.rst @@ -0,0 +1,32 @@ + +=============== +ortops.tutorial +=============== + +CPU: onnx_extented.ortops.tutorial.cpu +====================================== + +.. autofunction:: onnx_extended.ortops.tutorial.cpu.get_ort_ext_libs + +**List of implemented kernels** + +.. runpython:: + :showcode: + :rst: + + from onnx_extended.ortops.tutorial.cpu import documentation + print("\n".join(documentation())) + +CUDA: onnx_extented.ortops.tutorial.cuda +======================================== + +.. autofunction:: onnx_extended.ortops.tutorial.cuda.get_ort_ext_libs + +**List of implemented kernels** + +.. runpython:: + :showcode: + :rst: + + from onnx_extended.ortops.tutorial.cuda import documentation + print("\n".join(documentation())) diff --git a/_doc/conf.py b/_doc/conf.py index 12e6b5cb..947955fc 100644 --- a/_doc/conf.py +++ b/_doc/conf.py @@ -74,7 +74,8 @@ epkg_dictionary = { "cmake": "https://cmake.org/", "CPUExecutionProvider": "https://onnxruntime.ai/docs/execution-providers/", - "cublasLtMatmul": "https://docs.nvidia.com/cuda/cublas/index.html?highlight=cublasltmatmul#cublasltmatmul", + "cublasLtMatmul": "https://docs.nvidia.com/cuda/cublas/index.html?" 
+ "highlight=cublasltmatmul#cublasltmatmul", "CUDA": "https://developer.nvidia.com/", "cudnn": "https://developer.nvidia.com/cudnn", "cython": "https://cython.org/", @@ -97,7 +98,8 @@ ), "onnxruntime C API": "https://onnxruntime.ai/docs/api/c/", "onnxruntime Graph Optimizations": ( - "https://onnxruntime.ai/docs/performance/model-optimizations/graph-optimizations.html" + "https://onnxruntime.ai/docs/performance/" + "model-optimizations/graph-optimizations.html" ), "openmp": "https://www.openmp.org/", "protobuf": "https://github.com/protocolbuffers/protobuf", diff --git a/_doc/examples/plot_bench_ort.py b/_doc/examples/plot_bench_cypy_ort.py similarity index 99% rename from _doc/examples/plot_bench_ort.py rename to _doc/examples/plot_bench_cypy_ort.py index 0bebb160..1fabfb5c 100644 --- a/_doc/examples/plot_bench_ort.py +++ b/_doc/examples/plot_bench_cypy_ort.py @@ -1,6 +1,4 @@ """ -.. _l-example-bench-ort: - Measuring onnxruntime performance ================================= diff --git a/_doc/index.rst b/_doc/index.rst index 79bb2b76..dfd1cd04 100644 --- a/_doc/index.rst +++ b/_doc/index.rst @@ -105,15 +105,15 @@ If not, some extensions might not be available. :: python setup.py build_ext --inplace - # or - pip install -e . --config-settings="--enable_nvtx=1" + # or (not working yet) + pip install -e . --config-settings="--use_nvtx=1" `NVTX `_ can be enabled with the following command: :: - python setup.py build_ext --inplace --enable_nvtx 1 + python setup.py build_ext --inplace --use_nvtx 1 Experimental cython binding for onnxruntime +++++++++++++++++++++++++++++++++++++++++++ diff --git a/_doc/tutorial/index.rst b/_doc/tutorial/index.rst index 7b84ed93..b851d4b2 100644 --- a/_doc/tutorial/index.rst +++ b/_doc/tutorial/index.rst @@ -37,7 +37,7 @@ with two instructions: By default, *cmake* builds with CUDA if it is available. It can be disabled: * ``python setup.py build_ext --inplace --with-cuda=0``, the legacy way -* ``pip install -e . 
--config-settings="--with-cuda=0"``, the new way +* ``pip install -e . --config-settings="--with-cuda=0"``, the new way (not fully working yet) In case there are multiple versions of CUDA installed, option `cuda-version` can be specified: @@ -72,7 +72,7 @@ Validation, Experiments ../auto_examples/plot_bench_cpu_vector_sum_parallel ../auto_examples/plot_bench_cpu_vector_sum_avx_parallel ../auto_examples/plot_bench_gpu_vector_sum_gpu - ../auto_examples/plot_bench_ort + ../auto_examples/plot_bench_cypy_ort ../auto_examples/plot_bench_gemm_f8 ../auto_examples/plot_bench_gemm_ort ../auto_examples/plot_profile_gemm_ort diff --git a/_unittests/ut_ortcy/test_ortcy.py b/_unittests/ut_ortcy/test_ortcy.py index eef1e03d..86dabc50 100644 --- a/_unittests/ut_ortcy/test_ortcy.py +++ b/_unittests/ut_ortcy/test_ortcy.py @@ -1,4 +1,5 @@ import unittest +import warnings import os import numpy from onnx import TensorProto @@ -13,8 +14,26 @@ from onnx.checker import check_model from onnx_extended.ext_test_case import ExtTestCase +try: + from onnx_extended.ortcy.wrap.ortinf import OrtSession +except ImportError as e: + msg = "libonnxruntime.so.1.15.1: cannot open shared object file" + if msg in str(e): + from onnx_extended.ortcy.wrap import __file__ as loc + + all_files = os.listdir(os.path.dirname(loc)) + warnings.warn( + f"Unable to find onnxruntime {e!r}, found files in {os.path.dirname(loc)}: " + f"{all_files}." 
+ ) + OrtSession = None + here = os.path.dirname(__file__) + else: + OrtSession = "OrtSession is not initialized" + class TestOrtCy(ExtTestCase): + @unittest.skipIf(OrtSession is None, reason="libonnxruntime installation failed") def test_ort_get_available_providers(self): from onnx_extended.ortcy.wrap.ortinf import ort_get_available_providers @@ -23,6 +42,7 @@ def test_ort_get_available_providers(self): self.assertGreater(len(res), 0) self.assertIn("CPUExecutionProvider", res) + @unittest.skipIf(OrtSession is None, reason="libonnxruntime installation failed") def test_session(self): from onnx_extended.ortcy.wrap.ortinf import OrtSession @@ -66,6 +86,7 @@ def test_session(self): self.assertEqual(len(got), 1) self.assertEqualArray(got[0], x + y) + @unittest.skipIf(OrtSession is None, reason="libonnxruntime installation failed") def test_my_custom_ops_cy(self): from onnx_extended.ortcy.wrap.ortinf import OrtSession from onnx_extended.ortops.tutorial.cpu import get_ort_ext_libs @@ -95,6 +116,7 @@ def test_my_custom_ops_cy(self): got = session.run_2(x, y)[0] self.assertEqualArray(x + y, got) + @unittest.skipIf(OrtSession is None, reason="libonnxruntime installation failed") def test_my_custom_ops_with_attributes(self): from onnx_extended.ortcy.wrap.ortinf import OrtSession from onnx_extended.ortops.tutorial.cpu import get_ort_ext_libs diff --git a/_unittests/ut_ortops/test_optim_cpu.py b/_unittests/ut_ortops/test_optim_cpu.py new file mode 100644 index 00000000..380496af --- /dev/null +++ b/_unittests/ut_ortops/test_optim_cpu.py @@ -0,0 +1,91 @@ +import unittest +import numpy +from sklearn.datasets import make_regression +from sklearn.ensemble import RandomForestRegressor +from skl2onnx import to_onnx +from onnx_extended.ortops.tutorial.cpu import documentation +from onnx_extended.ortops.optim.optimize import ( + change_onnx_operator_domain, + get_node_attribute, +) +from onnx_extended.reference import CReferenceEvaluator +from onnx_extended.ext_test_case import 
ExtTestCase + +try: + from onnxruntime import InferenceSession, SessionOptions +except ImportError: + SessionOptions, InferenceSession = None, None + + +class TestOrtOpOptimCpu(ExtTestCase): + def test_get_ort_ext_libs(self): + from onnx_extended.ortops.optim.cpu import get_ort_ext_libs + + r = get_ort_ext_libs() + self.assertEqual(len(r), 1) + + def test_documentation(self): + doc = documentation() + self.assertIsInstance(doc, list) + self.assertEqual(len(doc), 2) + for d in doc: + self.assertIn("~~~~", d) + self.assertIsInstance(d, str) + + @unittest.skipIf(InferenceSession is None, "onnxruntime not installed") + def test_random_forest_regressor(self): + from onnx_extended.ortops.optim.cpu import get_ort_ext_libs + + X, y = make_regression(100, 2, n_informative=1, random_state=32) + X = X.astype(numpy.float32) + y = y.astype(numpy.float32) + + rf = RandomForestRegressor(3, max_depth=2, random_state=32) + rf.fit(X[:80], y[:80]) + expected = rf.predict(X[80:]).astype(numpy.float32).reshape((-1, 1)) + onx = to_onnx(rf, X[:1]) + feeds = {"X": X[80:]} + + # check with onnxruntime + sess = InferenceSession( + onx.SerializeToString(), providers=["CPUExecutionProvider"] + ) + got = sess.run(None, feeds)[0] + self.assertEqualArray(expected, got, atol=1e-5) + + # check with CReferenceEvaluator + ref = CReferenceEvaluator(onx) + got = ref.run(None, feeds)[0] + self.assertEqualArray(expected, got.reshape((-1, 1)), atol=1e-5) + + # transformation + att = get_node_attribute(onx.graph.node[0], "nodes_modes") + modes = ",".join(map(lambda s: s.decode("ascii"), att.strings)) + onx2 = change_onnx_operator_domain( + onx, + op_type="TreeEnsembleRegressor", + op_domain="ai.onnx.ml", + new_op_domain="onnx_extented.ortops.optim.cpu", + nodes_modes=modes, + ) + self.assertIn("onnx_extented.ortops.optim.cpu", str(onx2)) + + # check with CReferenceEvaluator + ref = CReferenceEvaluator(onx2) + got = ref.run(None, feeds)[0] + self.assertEqualArray(expected, got.reshape((-1, 1)), atol=1e-5) 
+ + # check with onnxruntime + custom op + r = get_ort_ext_libs() + self.assertExists(r[0]) + opts = SessionOptions() + opts.register_custom_ops_library(r[0]) + sess = InferenceSession( + onx2.SerializeToString(), opts, providers=["CPUExecutionProvider"] + ) + got = sess.run(None, feeds)[0] + self.assertEqualArray(expected, got, atol=1e-5) + + +if __name__ == "__main__": + unittest.main(verbosity=2) diff --git a/_unittests/ut_ortops/test_optim_py.py b/_unittests/ut_ortops/test_optim_py.py new file mode 100644 index 00000000..a7eef2a2 --- /dev/null +++ b/_unittests/ut_ortops/test_optim_py.py @@ -0,0 +1,164 @@ +import unittest +import numpy +from onnx import TensorProto +from onnx.checker import check_model +from onnx.reference import ReferenceEvaluator +from onnx.helper import ( + make_model, + make_node, + make_graph, + make_opsetid, + make_tensor_value_info, +) +from onnx_extended.ext_test_case import ExtTestCase +from onnx_extended.ortops.optim.optimize import change_onnx_operator_domain + + +class TestOrtOpOptimPy(ExtTestCase): + def test_replace_add(self): + X = make_tensor_value_info("X", TensorProto.FLOAT, [None]) + Y = make_tensor_value_info("Y", TensorProto.FLOAT, [None]) + Z = make_tensor_value_info("Z", TensorProto.FLOAT, [None]) + node = make_node("Add", ["X", "Y"], ["Z"]) + graph = make_graph([node], "g", [X, Y], [Z]) + onnx_model = make_model(graph, opset_imports=[make_opsetid("", 16)]) + check_model(onnx_model) + + repl = change_onnx_operator_domain(onnx_model, op_type="Add", new_op_type="Sub") + check_model(repl) + self.assertEqual(len(repl.graph.node), 1) + self.assertEqual(repl.graph.node[0].op_type, "Sub") + ref = ReferenceEvaluator(repl) + x = numpy.arange(5).astype(numpy.float32) + y = (x * 10).astype(numpy.float32) + got = ref.run(None, {"X": x, "Y": y}) + self.assertEqualArray(x - y, got[0]) + + def test_replace_argmin_1(self): + X = make_tensor_value_info("X", TensorProto.FLOAT, [None]) + Z = make_tensor_value_info("Z", TensorProto.FLOAT, 
[None]) + node = make_node("ArgMin", ["X"], ["Z"], axis=0) + graph = make_graph([node], "g", [X], [Z]) + onnx_model = make_model(graph, opset_imports=[make_opsetid("", 16)]) + check_model(onnx_model) + + repl = change_onnx_operator_domain( + onnx_model, op_type="ArgMin", new_op_type="ArgMin", axis=None + ) + check_model(repl) + self.assertEqual(len(repl.graph.node), 1) + self.assertEqual(repl.graph.node[0].op_type, "ArgMin") + self.assertEqual(len(repl.graph.node[0].attribute), 0) + ref = ReferenceEvaluator(repl) + x = numpy.arange(5).astype(numpy.float32) + got = ref.run(None, {"X": x}) + self.assertEqualArray(numpy.argmin(x).reshape((-1,)), got[0]) + + def test_replace_argmin_2(self): + X = make_tensor_value_info("X", TensorProto.FLOAT, [None]) + Z = make_tensor_value_info("Z", TensorProto.FLOAT, [None]) + node = make_node("ArgMin", ["X"], ["Z"]) + graph = make_graph([node], "g", [X], [Z]) + onnx_model = make_model(graph, opset_imports=[make_opsetid("", 16)]) + check_model(onnx_model) + + repl = change_onnx_operator_domain( + onnx_model, op_type="ArgMin", new_op_type="ArgMin", axis=0 + ) + check_model(repl) + self.assertEqual(len(repl.graph.node), 1) + self.assertEqual(repl.graph.node[0].op_type, "ArgMin") + self.assertEqual(len(repl.graph.node[0].attribute), 1) + ref = ReferenceEvaluator(repl) + x = numpy.arange(5).astype(numpy.float32) + got = ref.run(None, {"X": x}) + self.assertEqualArray(numpy.argmin(x).reshape((-1,)), got[0]) + + def test_replace_argmin_3(self): + X = make_tensor_value_info("X", TensorProto.FLOAT, [None, None]) + Z = make_tensor_value_info("Z", TensorProto.FLOAT, [None, None]) + node = make_node("ArgMin", ["X"], ["Z"], axis=1) + graph = make_graph([node], "g", [X], [Z]) + onnx_model = make_model(graph, opset_imports=[make_opsetid("", 16)]) + check_model(onnx_model) + + repl = change_onnx_operator_domain( + onnx_model, op_type="ArgMin", new_op_type="ArgMax", axis=0 + ) + check_model(repl) + self.assertEqual(len(repl.graph.node), 1) + 
self.assertEqual(repl.graph.node[0].op_type, "ArgMax") + self.assertEqual(len(repl.graph.node[0].attribute), 1) + ref = ReferenceEvaluator(repl) + x = numpy.arange(4).astype(numpy.float32).reshape((2, -1)) + got = ref.run(None, {"X": x}) + self.assertEqualArray(numpy.argmax(x, axis=0, keepdims=1), got[0]) + + def test_replace_domain(self): + X = make_tensor_value_info("X", TensorProto.FLOAT, [None]) + Y = make_tensor_value_info("Y", TensorProto.FLOAT, [None]) + Z = make_tensor_value_info("Z", TensorProto.FLOAT, [None]) + node = make_node("Add", ["X", "Y"], ["Z"]) + graph = make_graph([node], "g", [X, Y], [Z]) + onnx_model = make_model(graph, opset_imports=[make_opsetid("", 16)]) + check_model(onnx_model) + + repl = change_onnx_operator_domain( + onnx_model, + op_type="Add", + new_op_type="Sub", + new_op_domain="NEW", + ) + check_model(repl) + self.assertEqual(len(repl.graph.node), 1) + self.assertEqual(repl.graph.node[0].op_type, "Sub") + self.assertIn('domain: "NEW"', str(repl)) + + def test_replace_domain_att(self): + X = make_tensor_value_info("X", TensorProto.FLOAT, [None]) + Y = make_tensor_value_info("Y", TensorProto.FLOAT, [None]) + Z = make_tensor_value_info("Z", TensorProto.FLOAT, [None]) + node = make_node("Add", ["X", "Y"], ["Z"]) + graph = make_graph([node], "g", [X, Y], [Z]) + onnx_model = make_model(graph, opset_imports=[make_opsetid("", 16)]) + check_model(onnx_model) + + repl = change_onnx_operator_domain( + onnx_model, + op_type="Add", + new_op_type="Sub", + new_op_domain="NEW", + ATTR=6, + ) + check_model(repl) + self.assertEqual(len(repl.graph.node), 1) + self.assertEqual(repl.graph.node[0].op_type, "Sub") + self.assertIn('domain: "NEW"', str(repl)) + self.assertIn('name: "ATTR"', str(repl)) + self.assertIn("i: 6", str(repl)) + + def test_replace_domain_att_same(self): + X = make_tensor_value_info("X", TensorProto.FLOAT, [None]) + Y = make_tensor_value_info("Y", TensorProto.FLOAT, [None]) + Z = make_tensor_value_info("Z", TensorProto.FLOAT, 
[None]) + node = make_node("Add", ["X", "Y"], ["Z"]) + graph = make_graph([node], "g", [X, Y], [Z]) + onnx_model = make_model(graph, opset_imports=[make_opsetid("", 16)]) + check_model(onnx_model) + + repl = change_onnx_operator_domain( + onnx_model, + op_type="Add", + new_op_domain="NEW", + ATTR=6, + ) + check_model(repl) + self.assertEqual(len(repl.graph.node), 1) + self.assertEqual(repl.graph.node[0].op_type, "Add") + self.assertIn('domain: "NEW"', str(repl)) + self.assertIn('name: "ATTR"', str(repl)) + self.assertIn("i: 6", str(repl)) + + +if __name__ == "__main__": + unittest.main(verbosity=2) diff --git a/_unittests/ut_ortops/test_tutorial_cuda.py b/_unittests/ut_ortops/test_tutorial_cuda.py index 1df38c72..737c8b4c 100644 --- a/_unittests/ut_ortops/test_tutorial_cuda.py +++ b/_unittests/ut_ortops/test_tutorial_cuda.py @@ -16,27 +16,19 @@ except ImportError: onnx_simple_text_plot = str try: + from onnxruntime import InferenceSession +except ImportError: + InferenceSession = None + ort_version = "0.0" +if InferenceSession is not None: from onnxruntime import ( - InferenceSession, SessionOptions, get_available_providers, __version__ as ort_version, ) from onnxruntime.capi.onnxruntime_pybind11_state import Fail as OrtFail -except ImportError: - ( - SessionOptions, - InferenceSession, - get_available_providers, - ort_version, - OrtFail, - ) = ( - None, - None, - None, - None, - None, - ) + + from onnx_extended.ortops.tutorial.cuda import documentation from onnx_extended.ext_test_case import ExtTestCase from onnx_extended import has_cuda @@ -47,7 +39,21 @@ get_device_prop = None +from onnx_extended.validation.cuda import cuda_version + + +def has_cuda_ort(): + if not has_cuda(): + return False + if InferenceSession is None: + return False + if "CUDAExecutionProvider" not in get_available_providers(): + return False + return True + + class TestOrtOpTutorialCuda(ExtTestCase): + @unittest.skipIf(get_device_prop is None, reason="CUDA not available") def 
test_get_ort_ext_libs(self): from onnx_extended.ortops.tutorial.cuda import get_ort_ext_libs @@ -211,10 +217,9 @@ def check(f): f"\n----\ngot=\n{got[0][:2,:2]}" ) from e - @unittest.skipIf(InferenceSession is None, "onnxruntime not installed") @unittest.skipIf( - "CUDAExecutionProvider" not in get_available_providers(), - reason="CUDA provider not available", + not has_cuda_ort(), + reason="onnxruntime not installed or CUDA provider not available", ) def test_custom_gemm_float32_default(self): self.common_test_custom_gemm( @@ -226,10 +231,9 @@ def test_custom_gemm_float32_default(self): computeType="CUBLAS_COMPUTE_32F_FAST_TF32", ) - @unittest.skipIf(InferenceSession is None, "onnxruntime not installed") @unittest.skipIf( - "CUDAExecutionProvider" not in get_available_providers(), - reason="CUDA provider not available", + not has_cuda_ort(), + reason="onnxruntime not installed or CUDA provider not available", ) def test_custom_gemm_float32_relu(self): self.common_test_custom_gemm( @@ -242,10 +246,9 @@ def test_custom_gemm_float32_relu(self): activation="RELU", ) - @unittest.skipIf(InferenceSession is None, "onnxruntime not installed") @unittest.skipIf( - "CUDAExecutionProvider" not in get_available_providers(), - reason="CUDA provider not available", + not has_cuda_ort(), + reason="onnxruntime not installed or CUDA provider not available", ) def test_custom_gemm_float32_gelu(self): self.common_test_custom_gemm( @@ -258,10 +261,9 @@ def test_custom_gemm_float32_gelu(self): activation="GELU", ) - @unittest.skipIf(InferenceSession is None, "onnxruntime not installed") @unittest.skipIf( - "CUDAExecutionProvider" not in get_available_providers(), - reason="CUDA provider not available", + not has_cuda_ort(), + reason="onnxruntime not installed or CUDA provider not available", ) def test_custom_gemm_float32_col_major_relu(self): self.common_test_custom_gemm( @@ -275,10 +277,9 @@ def test_custom_gemm_float32_col_major_relu(self): rowMajor=0, ) - 
@unittest.skipIf(InferenceSession is None, "onnxruntime not installed") @unittest.skipIf( - "CUDAExecutionProvider" not in get_available_providers(), - reason="CUDA provider not available", + not has_cuda_ort(), + reason="onnxruntime not installed or CUDA provider not available", ) def test_custom_gemm_float32_col_major_gelu(self): self.common_test_custom_gemm( @@ -292,10 +293,9 @@ def test_custom_gemm_float32_col_major_gelu(self): rowMajor=0, ) - @unittest.skipIf(InferenceSession is None, "onnxruntime not installed") @unittest.skipIf( - "CUDAExecutionProvider" not in get_available_providers(), - reason="CUDA provider not available", + not has_cuda_ort(), + reason="onnxruntime not installed or CUDA provider not available", ) def test_custom_gemm_float32_not_square(self): self.common_test_custom_gemm( @@ -308,10 +308,9 @@ def test_custom_gemm_float32_not_square(self): square=False, ) - @unittest.skipIf(InferenceSession is None, "onnxruntime not installed") @unittest.skipIf( - "CUDAExecutionProvider" not in get_available_providers(), - reason="CUDA provider not available", + not has_cuda_ort(), + reason="onnxruntime not installed or CUDA provider not available", ) def test_custom_gemm_float32_col_major(self): self.common_test_custom_gemm( @@ -324,10 +323,9 @@ def test_custom_gemm_float32_col_major(self): rowMajor=0, ) - @unittest.skipIf(InferenceSession is None, "onnxruntime not installed") @unittest.skipIf( - "CUDAExecutionProvider" not in get_available_providers(), - reason="CUDA provider not available", + not has_cuda_ort(), + reason="onnxruntime not installed or CUDA provider not available", ) def test_custom_gemm_float32_col_major_not_square(self): self.common_test_custom_gemm( @@ -341,10 +339,13 @@ def test_custom_gemm_float32_col_major_not_square(self): square=False, ) - @unittest.skipIf(InferenceSession is None, "onnxruntime not installed") @unittest.skipIf( - "CUDAExecutionProvider" not in get_available_providers(), - reason="CUDA provider not available", + 
not has_cuda_ort(), + reason="onnxruntime not installed or CUDA provider not available", + ) + @unittest.skipIf( + Version(cuda_version()) < Version("12.0"), + reason="beta != 0 bugged in CUDA 11.8.", ) def test_custom_gemm_float32_bias(self): self.common_test_custom_gemm( @@ -357,10 +358,13 @@ def test_custom_gemm_float32_bias(self): beta=1.0, ) - @unittest.skipIf(InferenceSession is None, "onnxruntime not installed") @unittest.skipIf( - "CUDAExecutionProvider" not in get_available_providers(), - reason="CUDA provider not available", + not has_cuda_ort(), + reason="onnxruntime not installed or CUDA provider not available", + ) + @unittest.skipIf( + Version(cuda_version()) < Version("12.0"), + reason="beta != 0 bugged in CUDA 11.8.", ) def test_custom_gemm_float32_bias_01(self): self.common_test_custom_gemm( @@ -373,10 +377,13 @@ def test_custom_gemm_float32_bias_01(self): beta=1.0, ) - @unittest.skipIf(InferenceSession is None, "onnxruntime not installed") @unittest.skipIf( - "CUDAExecutionProvider" not in get_available_providers(), - reason="CUDA provider not available", + not has_cuda_ort(), + reason="onnxruntime not installed or CUDA provider not available", + ) + @unittest.skipIf( + Version(cuda_version()) < Version("12.0"), + reason="beta != 0 bugged in CUDA 11.8.", ) def test_custom_gemm_float32_bias_col_major(self): self.common_test_custom_gemm( @@ -390,10 +397,13 @@ def test_custom_gemm_float32_bias_col_major(self): rowMajor=0, ) - @unittest.skipIf(InferenceSession is None, "onnxruntime not installed") @unittest.skipIf( - "CUDAExecutionProvider" not in get_available_providers(), - reason="CUDA provider not available", + not has_cuda_ort(), + reason="onnxruntime not installed or CUDA provider not available", + ) + @unittest.skipIf( + Version(cuda_version()) < Version("12.0"), + reason="beta != 0 bugged in CUDA 11.8.", ) def test_custom_gemm_float32_not_square_bias(self): self.common_test_custom_gemm( @@ -407,10 +417,13 @@ def 
test_custom_gemm_float32_not_square_bias(self): square=False, ) - @unittest.skipIf(InferenceSession is None, "onnxruntime not installed") @unittest.skipIf( - "CUDAExecutionProvider" not in get_available_providers(), - reason="CUDA provider not available", + not has_cuda_ort(), + reason="onnxruntime not installed or CUDA provider not available", + ) + @unittest.skipIf( + Version(cuda_version()) < Version("12.0"), + reason="beta != 0 bugged in CUDA 11.8.", ) def test_custom_gemm_float32_not_square_bias_col_major(self): self.common_test_custom_gemm( @@ -425,10 +438,9 @@ def test_custom_gemm_float32_not_square_bias_col_major(self): rowMajor=0, ) - @unittest.skipIf(InferenceSession is None, "onnxruntime not installed") @unittest.skipIf( - "CUDAExecutionProvider" not in get_available_providers(), - reason="CUDA provider not available", + not has_cuda_ort(), + reason="onnxruntime not installed or CUDA provider not available", ) def test_custom_gemm_float16_default(self): self.common_test_custom_gemm( @@ -440,10 +452,9 @@ def test_custom_gemm_float16_default(self): computeType="CUBLAS_COMPUTE_32F", ) - @unittest.skipIf(InferenceSession is None, "onnxruntime not installed") @unittest.skipIf( - "CUDAExecutionProvider" not in get_available_providers(), - reason="CUDA provider not available", + not has_cuda_ort(), + reason="onnxruntime not installed or CUDA provider not available", ) def test_custom_gemm_float32_row_major(self): self.common_test_custom_gemm( @@ -456,10 +467,9 @@ def test_custom_gemm_float32_row_major(self): rowMajor=1, ) - @unittest.skipIf(InferenceSession is None, "onnxruntime not installed") @unittest.skipIf( - "CUDAExecutionProvider" not in get_available_providers(), - reason="CUDA provider not available", + not has_cuda_ort(), + reason="onnxruntime not installed or CUDA provider not available", ) @unittest.skipIf( Version(ort_version) < Version("1.16"), reason="float8 types not released" @@ -478,10 +488,9 @@ def test_custom_gemm_float8(self): rowMajor=0, ) 
- @unittest.skipIf(InferenceSession is None, "onnxruntime not installed") @unittest.skipIf( - "CUDAExecutionProvider" not in get_available_providers(), - reason="CUDA provider not available", + not has_cuda_ort(), + reason="onnxruntime not installed or CUDA provider not available", ) @unittest.skipIf( Version(ort_version) < Version("1.16"), reason="float8 types not released" diff --git a/_unittests/ut_reference/test_c_tree_ensemble.py b/_unittests/ut_reference/test_c_tree_ensemble.py index 8fb614ea..88cf7b1b 100644 --- a/_unittests/ut_reference/test_c_tree_ensemble.py +++ b/_unittests/ut_reference/test_c_tree_ensemble.py @@ -15,19 +15,16 @@ from lightgbm import LGBMRegressor from onnx_extended.ext_test_case import ExtTestCase, ignore_warnings from onnx_extended.reference import CReferenceEvaluator -from onnx_extended.reference.c_ops.c_op_tree_ensemble_classifier import ( - TreeEnsembleClassifier_1, -) -from onnx_extended.reference.c_ops.c_op_tree_ensemble_regressor import ( - TreeEnsembleRegressor_1, - TreeEnsembleRegressor_3, -) class TestCTreeEnsemble(ExtTestCase): @unittest.skipIf(onnx_opset_version() < 19, reason="ReferenceEvaluator is bugged") @ignore_warnings((FutureWarning, DeprecationWarning)) def test_decision_tree_classifier_bin(self): + from onnx_extended.reference.c_ops.c_op_tree_ensemble_classifier import ( + TreeEnsembleClassifier_1, + ) + iris = load_iris() X, y = iris.data.astype(numpy.float32), iris.target y[y == 2] = 0 @@ -51,6 +48,10 @@ def test_decision_tree_classifier_bin(self): @unittest.skipIf(onnx_opset_version() < 19, reason="ReferenceEvaluator is bugged") @ignore_warnings((FutureWarning, DeprecationWarning)) def test_decision_tree_classifier_multi(self): + from onnx_extended.reference.c_ops.c_op_tree_ensemble_classifier import ( + TreeEnsembleClassifier_1, + ) + iris = load_iris() X, y = iris.data.astype(numpy.float32), iris.target X_train, X_test, y_train, _ = train_test_split(X, y, random_state=11) @@ -73,6 +74,10 @@ def 
test_decision_tree_classifier_multi(self): @unittest.skipIf(onnx_opset_version() < 19, reason="ReferenceEvaluator is bugged") @ignore_warnings((FutureWarning, DeprecationWarning)) def test_decision_tree_classifier_plusten(self): + from onnx_extended.reference.c_ops.c_op_tree_ensemble_classifier import ( + TreeEnsembleClassifier_1, + ) + iris = load_iris() X, y = iris.data.astype(numpy.float32), iris.target y += 10 @@ -95,6 +100,10 @@ def test_decision_tree_classifier_plusten(self): @unittest.skipIf(onnx_opset_version() < 19, reason="ReferenceEvaluator is bugged") @ignore_warnings((FutureWarning, DeprecationWarning)) def test_gradient_boosting_classifier2(self): + from onnx_extended.reference.c_ops.c_op_tree_ensemble_classifier import ( + TreeEnsembleClassifier_1, + ) + iris = load_iris() X, y = iris.data.astype(numpy.float32), iris.target y[y == 2] = 1 @@ -116,6 +125,10 @@ def test_gradient_boosting_classifier2(self): @unittest.skipIf(onnx_opset_version() < 19, reason="ReferenceEvaluator is bugged") @ignore_warnings((FutureWarning, DeprecationWarning)) def test_gradient_boosting_classifier3(self): + from onnx_extended.reference.c_ops.c_op_tree_ensemble_classifier import ( + TreeEnsembleClassifier_1, + ) + iris = load_iris() X, y = iris.data.astype(numpy.float32), iris.target X_train, X_test, y_train, _ = train_test_split(X, y, random_state=11) @@ -139,6 +152,10 @@ def test_gradient_boosting_classifier3(self): onnx_opset_version() < 19, reason="ArrayFeatureExtractor has no implementation" ) def test_decision_tree_classifier_mlabel(self): + from onnx_extended.reference.c_ops.c_op_tree_ensemble_classifier import ( + TreeEnsembleClassifier_1, + ) + iris = load_iris() X, y_ = iris.data.astype(numpy.float32), iris.target y = numpy.zeros((y_.shape[0], 3), dtype=numpy.int64) @@ -166,6 +183,10 @@ def test_decision_tree_classifier_mlabel(self): @unittest.skipIf(onnx_opset_version() < 19, reason="ReferenceEvaluator is bugged") @ignore_warnings((FutureWarning, 
DeprecationWarning)) def test_decision_tree_regressor(self): + from onnx_extended.reference.c_ops.c_op_tree_ensemble_regressor import ( + TreeEnsembleRegressor_1, + ) + iris = load_iris() X, y = iris.data, iris.target X_train, X_test, y_train, _ = train_test_split(X, y, random_state=11) @@ -195,6 +216,10 @@ def test_decision_tree_regressor(self): @ignore_warnings((FutureWarning, DeprecationWarning, UserWarning)) def test_decision_tree_regressor_double(self): + from onnx_extended.reference.c_ops.c_op_tree_ensemble_regressor import ( + TreeEnsembleRegressor_3, + ) + iris = load_iris() X, y = iris.data, iris.target X_train, X_test, y_train, _ = train_test_split(X, y, random_state=11) @@ -379,7 +404,7 @@ def common_test_onnxrt_python_tree_ensemble_runtime_version_cls( oinf = CReferenceEvaluator(model_def) y = oinf.run(None, {"X": X_test.astype(dtype)}) lexp = clr.predict_proba(X_test).astype(numpy.float32) - atol = {numpy.float32: 1e-5, numpy.float64: 1e-1} + atol = {numpy.float32: 1e-5, numpy.float64: 1.01e-1} with self.subTest(dtype=dtype): if single_cls: diff = list(sorted(numpy.abs(lexp.ravel() - y[1]))) diff --git a/_unittests/ut_validation/test_cuda_gemm.py b/_unittests/ut_validation/test_cuda_gemm.py index cdeb52a5..823a92dc 100644 --- a/_unittests/ut_validation/test_cuda_gemm.py +++ b/_unittests/ut_validation/test_cuda_gemm.py @@ -13,6 +13,7 @@ class TestCudaGemm(ExtTestCase): + @unittest.skipIf(get_device_prop is None, reason="CUDA not available") def test_get_device_prop(self): r = get_device_prop() self.assertIsInstance(r, dict) diff --git a/_unittests/ut_validation/test_speed_metrics.py b/_unittests/ut_validation/test_speed_metrics.py index 1a1003ca..27965a2d 100644 --- a/_unittests/ut_validation/test_speed_metrics.py +++ b/_unittests/ut_validation/test_speed_metrics.py @@ -1,17 +1,21 @@ import unittest from onnx_extended.ext_test_case import ExtTestCase -from onnx_extended.validation.cpu._validation import ( - benchmark_cache, - benchmark_cache_tree, -) 
class TestSpeedMetrics(ExtTestCase): def test_benchmark_cache(self): + from onnx_extended.validation.cpu._validation import ( + benchmark_cache, + ) + res = benchmark_cache(1000, False) self.assertGreater(res, 0) def test_benchmark_cache_tree(self): + from onnx_extended.validation.cpu._validation import ( + benchmark_cache_tree, + ) + res = benchmark_cache_tree(1000) self.assertIsInstance(res, list) self.assertEqual(len(res), 1000) diff --git a/_unittests/ut_validation/test_vector_sum.py b/_unittests/ut_validation/test_vector_sum.py index d3196ad0..0c3ce28a 100644 --- a/_unittests/ut_validation/test_vector_sum.py +++ b/_unittests/ut_validation/test_vector_sum.py @@ -1,22 +1,12 @@ import unittest import numpy from onnx_extended.ext_test_case import ExtTestCase -from onnx_extended.validation.cpu._validation import ( - vector_add, - vector_sum, - vector_sum_array, - vector_sum_array_parallel, - vector_sum_array_avx, - vector_sum_array_avx_parallel, -) -from onnx_extended.validation.cython.vector_function_cy import ( - vector_sum_c, - vector_add_c, -) class TestVectorSum(ExtTestCase): def test_vector_sum_c(self): + from onnx_extended.validation.cython.vector_function_cy import vector_sum_c + values = numpy.array([[10, 1, 4, 5, 6, 7]], dtype=numpy.float32) t1 = vector_sum_c(values, True) t2 = vector_sum_c(values, False) @@ -24,6 +14,8 @@ def test_vector_sum_c(self): self.assertEqual(t2, 33) def test_vector_sum(self): + from onnx_extended.validation.cpu._validation import vector_sum + values = [10, 1, 4, 5, 6, 7] t1 = vector_sum(1, values, True) t2 = vector_sum(1, values, False) @@ -36,6 +28,8 @@ def test_vector_sum(self): self.assertEqual(t2, 33) def test_vector_sum_array(self): + from onnx_extended.validation.cpu._validation import vector_sum_array + values = numpy.array([10, 1, 4, 5, 6, 7], dtype=numpy.float32) t1 = vector_sum_array(1, values, True) t2 = vector_sum_array(1, values, False) @@ -48,6 +42,8 @@ def test_vector_sum_array(self): self.assertEqual(t2, 33) def 
test_vector_sum_array_parallel(self): + from onnx_extended.validation.cpu._validation import vector_sum_array_parallel + values = numpy.arange(16 * 16).reshape((-1, 16)).astype(numpy.float32) t = values.sum() t1 = vector_sum_array_parallel(16, values, True) @@ -56,18 +52,26 @@ def test_vector_sum_array_parallel(self): self.assertEqual(t, t2) def test_vector_sum_array_avx(self): + from onnx_extended.validation.cpu._validation import vector_sum_array_avx + values = numpy.arange(16 * 16).reshape((-1, 16)).astype(numpy.float32) t = values.sum() t1 = vector_sum_array_avx(16, values) self.assertEqual(t, t1) def test_vector_sum_array_avx_parallel(self): + from onnx_extended.validation.cpu._validation import ( + vector_sum_array_avx_parallel, + ) + values = numpy.arange(16 * 16).reshape((-1, 16)).astype(numpy.float32) t = values.sum() t1 = vector_sum_array_avx_parallel(16, values) self.assertEqual(t, t1) def test_vector_add_exc(self): + from onnx_extended.validation.cpu._validation import vector_add + # This test checks function vector_add # raises an exception if the dimension do not match. 
v1 = numpy.ones((3, 4), dtype=numpy.float32) @@ -77,6 +81,8 @@ def test_vector_add_exc(self): self.assertRaise(lambda: vector_add(v1, v2), RuntimeError) def test_vector_add(self): + from onnx_extended.validation.cpu._validation import vector_add + v1 = numpy.ones((3, 4), dtype=numpy.float32) v2 = (numpy.ones((3, 4)) * 10).astype(numpy.float32) v3 = vector_add(v1, v2) @@ -84,6 +90,8 @@ def test_vector_add(self): self.assertEqualArray(v1 + v2, v3) def test_vector_add_c(self): + from onnx_extended.validation.cython.vector_function_cy import vector_add_c + t1 = numpy.arange(10).reshape((2, 5)).astype(numpy.float32) t2 = numpy.arange(10).reshape((2, 5)).astype(numpy.float32) res = t1 + t2 diff --git a/_unittests/ut_xrun_doc/test_documentation_examples.py b/_unittests/ut_xrun_doc/test_documentation_examples.py index cb707f38..538a272b 100644 --- a/_unittests/ut_xrun_doc/test_documentation_examples.py +++ b/_unittests/ut_xrun_doc/test_documentation_examples.py @@ -1,4 +1,5 @@ import unittest +import warnings import os import sys import importlib @@ -10,6 +11,23 @@ VERBOSE = 0 ROOT = os.path.realpath(os.path.abspath(os.path.join(onnx_extended_file, "..", ".."))) +try: + from onnx_extended.ortcy.wrap.ortinf import OrtSession +except ImportError as e: + msg = "libonnxruntime.so.1.15.1: cannot open shared object file" + if msg in str(e): + from onnx_extended.ortcy.wrap import __file__ as loc + + all_files = os.listdir(os.path.dirname(loc)) + warnings.warn( + f"Unable to find onnxruntime {e!r}, found files in {os.path.dirname(loc)}: " + f"{all_files}." 
+ ) + OrtSession = None + here = os.path.dirname(__file__) + else: + OrtSession = "OrtSession is not initialized" + def import_source(module_file_path, module_name): if not os.path.exists(module_file_path): @@ -64,6 +82,9 @@ def add_test_methods(cls): fold = os.path.normpath(os.path.join(this, "..", "..", "_doc", "examples")) found = os.listdir(fold) for name in found: + if OrtSession is None and name in {"plot_bench_cypy_ort.py"}: + # The build went wrong. + continue if name.startswith("plot_") and name.endswith(".py"): short_name = os.path.split(os.path.splitext(name)[0])[-1] diff --git a/azure-pipelines.yml b/azure-pipelines.yml index c315f5d0..778fe91c 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -4,8 +4,8 @@ jobs: vmImage: 'ubuntu-latest' strategy: matrix: - Python310-Linux: - python.version: '3.10' + Python311-Linux: + python.version: '3.11' maxParallel: 3 steps: @@ -30,8 +30,8 @@ jobs: black --diff . displayName: 'Black' - script: | - cmake-lint _cmake/Find* --disabled-codes C0103 C0113 --linelength=88 - cmake-lint _cmake/CMake* --disabled-codes C0103 C0113 --linelength=88 + cmake-lint _cmake/Find* --disabled-codes C0103 C0113 --line-width=88 + cmake-lint _cmake/CMake* --disabled-codes C0103 C0113 --line-width=88 displayName: 'cmake-lint' - script: | rstcheck -r ./_doc ./onnx_extended @@ -40,17 +40,25 @@ jobs: cython-lint . displayName: 'cython-lint' - script: | - python -m pip wheel . --wheel-dir dist --config-settings="--with_cuda=0" -v -v -v + export USE_CUDA=0 + python -m pip install -e . -v + displayName: 'pip install -e . -v' + - script: | + python -m pytest _unittests --durations=10 + displayName: 'Runs Unit Tests' + - script: | + # --config-settings does not work yet. + # python -m pip wheel . --config-settings="--use_cuda=0" -v + export USE_CUDA=0 + python -m pip wheel . -v displayName: 'build wheel' - script: | - python -m pip install . 
-v -v -v - displayName: 'install wheel' -# - script: | # It fails due to ModuleNotFoundError: No module named 'onnx_extended.reference.c_ops.cpu.c_op_conv_' -# python -m pytest . --durations=10 -# displayName: 'Runs Unit Tests' + mkdir dist + cp onnx_extended*.whl dist + displayName: 'copy wheel' - task: PublishPipelineArtifact@0 inputs: - artifactName: 'wheel-linux-wheel-$(python.version)' + artifactName: 'wheel-linux-pip-$(python.version)' targetPath: 'dist' - job: 'TestLinux' @@ -58,8 +66,8 @@ jobs: vmImage: 'ubuntu-latest' strategy: matrix: - Python310-Linux: - python.version: '3.10' + Python311-Linux: + python.version: '3.11' maxParallel: 3 steps: @@ -88,8 +96,8 @@ jobs: black --diff . displayName: 'Black' - script: | - cmake-lint _cmake/Find* --disabled-codes C0103 C0113 --linelength=88 - cmake-lint _cmake/CMake* --disabled-codes C0103 C0113 --linelength=88 + cmake-lint _cmake/Find* --disabled-codes C0103 C0113 --line-width=88 + cmake-lint _cmake/CMake* --disabled-codes C0103 C0113 --line-width=88 displayName: 'cmake-lint' - script: | cython-lint . @@ -109,9 +117,7 @@ jobs: displayName: 'Run C++ Unit Tests' - script: | - cd _unittests - python -m pytest . --durations=10 - cd .. + python -m pytest _unittests --durations=10 displayName: 'Runs Unit Tests' - script: | python -u setup.py bdist_wheel @@ -160,9 +166,7 @@ jobs: ctest -C Release --rerun-failed --output-on-failure displayName: 'Runs C++ Unit Tests' - script: | - cd _unittests - python -m pytest . --durations=10 - cd .. 
+ python -m pytest _unittests --durations=10 displayName: 'Runs Unit Tests' - script: | python -u setup.py bdist_wheel @@ -210,6 +214,7 @@ jobs: pip install -r requirements-dev.txt displayName: 'Install Requirements dev' - script: | + gcc --version python -c "import sys;print('PYTHON', sys.executable)" python -c "import sys;print('PYTHON', sys.version_info)" python -c "import numpy;print('numpy', numpy.__version__)" @@ -223,9 +228,7 @@ jobs: displayName: 'build wheel' - script: | source activate myEnvironment - cd _unittests - python -m pytest . --durations=10 - cd .. + python -m pytest _unittests --durations=10 displayName: 'Runs Unit Tests' - script: | python -u setup.py bdist_wheel diff --git a/onnx_extended/ortcy/wrap/helpers.h b/onnx_extended/onnx_extended_helpers.h similarity index 70% rename from onnx_extended/ortcy/wrap/helpers.h rename to onnx_extended/onnx_extended_helpers.h index f382ab5d..4e44971b 100644 --- a/onnx_extended/ortcy/wrap/helpers.h +++ b/onnx_extended/onnx_extended_helpers.h @@ -8,7 +8,23 @@ #include #include -namespace orthelpers { +namespace onnx_extended_helpers { + +inline std::vector SplitString(const std::string &input, + char delimiter) { + std::vector parts; + std::string::size_type start = 0; + std::string::size_type end = input.find(delimiter); + + while (end != std::string::npos) { + parts.push_back(input.substr(start, end - start)); + start = end + 1; + end = input.find(delimiter, start); + } + + parts.push_back(input.substr(start)); + return parts; +} inline void MakeStringInternal(std::ostringstream &ss) noexcept {} @@ -73,18 +89,18 @@ template inline std::string MakeString(const Args &...args) { } #if !defined(_THROW_DEFINED) -#define EXT_THROW(...) throw std::runtime_error(orthelpers::MakeString(__VA_ARGS__)); +#define EXT_THROW(...) \ + throw std::runtime_error(onnx_extended_helpers::MakeString(__VA_ARGS__)); #define _THROW_DEFINED #endif #if !defined(_ENFORCE_DEFINED) #define EXT_ENFORCE(cond, ...) 
\ if (!(cond)) \ - throw std::runtime_error( \ - orthelpers::MakeString("`", #cond, "` failed. ", orthelpers::MakeString(__VA_ARGS__))); + throw std::runtime_error(onnx_extended_helpers::MakeString( \ + "`", #cond, "` failed. ", \ + onnx_extended_helpers::MakeString(__VA_ARGS__))); #define _ENFORCE_DEFINED #endif - - -} // namespace orthelpers +} // namespace onnx_extended_helpers diff --git a/onnx_extended/ortcy/wrap/ortapi.cpp b/onnx_extended/ortcy/wrap/ortapi.cpp index 96a8580e..1273911b 100644 --- a/onnx_extended/ortcy/wrap/ortapi.cpp +++ b/onnx_extended/ortcy/wrap/ortapi.cpp @@ -1,5 +1,5 @@ #include "ortapi.h" -#include "helpers.h" +#include "onnx_extended_helpers.h" #include "ortapi_inline.h" #ifdef _WIN32 #include @@ -11,235 +11,246 @@ namespace ortapi { std::vector get_available_providers() { - int len; - char** providers; - ThrowOnError(GetOrtApi()->GetAvailableProviders(&providers, &len)); - std::vector available_providers(providers, providers + len); - ThrowOnError(GetOrtApi()->ReleaseAvailableProviders(providers, len)); - return available_providers; + int len; + char **providers; + ThrowOnError(GetOrtApi()->GetAvailableProviders(&providers, &len)); + std::vector available_providers(providers, providers + len); + ThrowOnError(GetOrtApi()->ReleaseAvailableProviders(providers, len)); + return available_providers; } void OrtCpuValue::free_ort_value() { - if (ort_value_ != nullptr) { - GetOrtApi()->ReleaseValue((OrtValue*)ort_value_); - ort_value_ = nullptr; - } + if (ort_value_ != nullptr) { + GetOrtApi()->ReleaseValue((OrtValue *)ort_value_); + ort_value_ = nullptr; + } } class OrtInference { public: + OrtInference() { + ThrowOnError( + GetOrtApi()->CreateEnv(ORT_LOGGING_LEVEL_WARNING, "ortcy", &env_)); + ThrowOnError(GetOrtApi()->CreateSessionOptions(&sess_options_)); + ThrowOnError(GetOrtApi()->CreateRunOptions(&run_options_)); + ThrowOnError(GetOrtApi()->CreateCpuMemoryInfo( + OrtArenaAllocator, OrtMemTypeDefault, &cpu_memory_info_)); + sess_ = nullptr; 
+ cpu_allocator_ = nullptr; + n_inputs_ = 0; + n_outputs_ = 0; + } - OrtInference() { - ThrowOnError(GetOrtApi()->CreateEnv(ORT_LOGGING_LEVEL_WARNING, "ortcy", &env_)); - ThrowOnError(GetOrtApi()->CreateSessionOptions(&sess_options_)); - ThrowOnError(GetOrtApi()->CreateRunOptions(&run_options_)); - ThrowOnError(GetOrtApi()->CreateCpuMemoryInfo(OrtArenaAllocator, OrtMemTypeDefault, &cpu_memory_info_)); - sess_ = nullptr; - cpu_allocator_ = nullptr; - n_inputs_ = 0; - n_outputs_ = 0; - } + void LoadFromFile(const char *filepath) { + EXT_ENFORCE(filepath != nullptr); + EXT_ENFORCE(env_ != nullptr); + EXT_ENFORCE(sess_options_ != nullptr); +#ifdef _WIN32 + std::string name(filepath); + std::wstring_convert> cvt; + std::wstring wname(cvt.from_bytes(name)); + ThrowOnError( + GetOrtApi()->CreateSession(env_, wname.c_str(), sess_options_, &sess_)); +#else + ThrowOnError( + GetOrtApi()->CreateSession(env_, filepath, sess_options_, &sess_)); +#endif + LoadFinalize(); + } + + void LoadFromBytes(const void *model_data, size_t model_data_length) { + ThrowOnError(GetOrtApi()->CreateSessionFromArray( + env_, model_data, model_data_length, sess_options_, &sess_)); + LoadFinalize(); + } + + ~OrtInference() { + if (cpu_allocator_ != nullptr) + GetOrtApi()->ReleaseAllocator(cpu_allocator_); + if (sess_ != nullptr) + GetOrtApi()->ReleaseSession(sess_); + GetOrtApi()->ReleaseSessionOptions(sess_options_); + GetOrtApi()->ReleaseRunOptions(run_options_); + GetOrtApi()->ReleaseMemoryInfo(cpu_memory_info_); + GetOrtApi()->ReleaseEnv(env_); + } - void LoadFromFile(const char* filepath) { - EXT_ENFORCE(filepath != nullptr); - EXT_ENFORCE(env_ != nullptr); - EXT_ENFORCE(sess_options_ != nullptr); - #ifdef _WIN32 - std::string name(filepath); + size_t GetInputCount() const { return n_inputs_; } + size_t GetOutputCount() const { return n_outputs_; } + + void Initialize(const char *optimized_file_path = nullptr, + int graph_optimization_level = -1, int enable_cuda = 0, + int cuda_device_id = 0, 
int set_denormal_as_zero = 0, + int intra_op_num_threads = -1, int inter_op_num_threads = -1, + const char **custom_libs = nullptr) { + if (graph_optimization_level != -1) { + ThrowOnError(GetOrtApi()->SetSessionGraphOptimizationLevel( + sess_options_, (GraphOptimizationLevel)graph_optimization_level)); + } + if (optimized_file_path != nullptr) { + std::string path(optimized_file_path); + if (!path.empty()) { +#ifdef _WIN32 std::wstring_convert> cvt; - std::wstring wname(cvt.from_bytes(name)); - ThrowOnError(GetOrtApi()->CreateSession(env_, wname.c_str(), sess_options_, &sess_)); - #else - ThrowOnError(GetOrtApi()->CreateSession(env_, filepath, sess_options_, &sess_)); - #endif - LoadFinalize(); + std::wstring wpath(cvt.from_bytes(path)); + ThrowOnError(GetOrtApi()->SetOptimizedModelFilePath(sess_options_, + wpath.c_str())); +#else + ThrowOnError(GetOrtApi()->SetOptimizedModelFilePath(sess_options_, + path.c_str())); +#endif + } } - - void LoadFromBytes(const void* model_data, size_t model_data_length) { - ThrowOnError(GetOrtApi()->CreateSessionFromArray(env_, model_data, model_data_length, sess_options_, &sess_)); - LoadFinalize(); + if (enable_cuda) { + OrtCUDAProviderOptions cuda_options; + cuda_options.device_id = cuda_device_id; + cuda_options.do_copy_in_default_stream = true; + // TODO: Support arena configuration for users of test runner + ThrowOnError(GetOrtApi()->SessionOptionsAppendExecutionProvider_CUDA( + sess_options_, &cuda_options)); } - - ~OrtInference() { - if (cpu_allocator_ != nullptr) GetOrtApi()->ReleaseAllocator(cpu_allocator_); - if (sess_ != nullptr) GetOrtApi()->ReleaseSession(sess_); - GetOrtApi()->ReleaseSessionOptions(sess_options_); - GetOrtApi()->ReleaseRunOptions(run_options_); - GetOrtApi()->ReleaseMemoryInfo(cpu_memory_info_); - GetOrtApi()->ReleaseEnv(env_); + // see https://github.com/microsoft/onnxruntime/blob/main/ + // include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h + if (set_denormal_as_zero) { + 
ThrowOnError(GetOrtApi()->AddSessionConfigEntry( + sess_options_, "session.set_denormal_as_zero", "1")); + } + if (intra_op_num_threads != -1) { + ThrowOnError(GetOrtApi()->SetIntraOpNumThreads(sess_options_, + intra_op_num_threads)); + } + if (inter_op_num_threads != -1) { + ThrowOnError(GetOrtApi()->SetInterOpNumThreads(sess_options_, + inter_op_num_threads)); + } + if (custom_libs != nullptr) { +#ifdef _WIN32 + std::wstring_convert> cvt; +#endif + while (*custom_libs != nullptr) { +#ifdef _WIN32 + std::wstring wpath(cvt.from_bytes(*custom_libs)); + ThrowOnError(GetOrtApi()->RegisterCustomOpsLibrary_V2(sess_options_, + wpath.c_str())); +#else + ThrowOnError(GetOrtApi()->RegisterCustomOpsLibrary_V2(sess_options_, + *custom_libs)); +#endif + ++custom_libs; + } } + } - size_t GetInputCount() const { return n_inputs_; } - size_t GetOutputCount() const { return n_outputs_; } + size_t Run(size_t n_inputs, OrtShape *shapes, OrtCpuValue *values, + size_t max_outputs, OrtShape *out_shapes, + OrtCpuValue *out_values) { + if (max_outputs < n_outputs_) + EXT_THROW("Not enough expected outputs, max_outputs=", max_outputs, " > ", + n_outputs_, "."); + if (n_inputs > n_inputs_) + EXT_THROW("Too many inputs, n_inputs=", n_inputs, " > ", n_inputs, "."); + std::vector ort_values(n_inputs); - void Initialize(const char* optimized_file_path = nullptr, - int graph_optimization_level = -1, - int enable_cuda = 0, - int cuda_device_id = 0, - int set_denormal_as_zero = 0, - int intra_op_num_threads = -1, - int inter_op_num_threads = -1, - const char** custom_libs = nullptr) { - if (graph_optimization_level != -1) { - ThrowOnError(GetOrtApi()->SetSessionGraphOptimizationLevel( - sess_options_, (GraphOptimizationLevel)graph_optimization_level)); - } - if (optimized_file_path != nullptr) { - std::string path(optimized_file_path); - if (!path.empty()) { - #ifdef _WIN32 - std::wstring_convert> cvt; - std::wstring wpath(cvt.from_bytes(path)); - 
ThrowOnError(GetOrtApi()->SetOptimizedModelFilePath( - sess_options_, wpath.c_str())); - #else - ThrowOnError(GetOrtApi()->SetOptimizedModelFilePath( - sess_options_, path.c_str())); - #endif - } - } - if (enable_cuda) { - OrtCUDAProviderOptions cuda_options; - cuda_options.device_id = cuda_device_id; - cuda_options.do_copy_in_default_stream = true; - // TODO: Support arena configuration for users of test runner - ThrowOnError(GetOrtApi()->SessionOptionsAppendExecutionProvider_CUDA(sess_options_, &cuda_options)); - } - // see https://github.com/microsoft/onnxruntime/blob/main/ - // include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h - if (set_denormal_as_zero) { - ThrowOnError(GetOrtApi()->AddSessionConfigEntry( - sess_options_, "session.set_denormal_as_zero", "1")); - } - if (intra_op_num_threads != -1) { - ThrowOnError(GetOrtApi()->SetIntraOpNumThreads(sess_options_, intra_op_num_threads)); - } - if (inter_op_num_threads != -1) { - ThrowOnError(GetOrtApi()->SetInterOpNumThreads(sess_options_, inter_op_num_threads)); - } - if (custom_libs != nullptr) { - #ifdef _WIN32 - std::wstring_convert> cvt; - #endif - while (*custom_libs != nullptr) { - #ifdef _WIN32 - std::wstring wpath(cvt.from_bytes(*custom_libs)); - ThrowOnError(GetOrtApi()->RegisterCustomOpsLibrary_V2(sess_options_, wpath.c_str())); - #else - ThrowOnError(GetOrtApi()->RegisterCustomOpsLibrary_V2(sess_options_, *custom_libs)); - #endif - ++custom_libs; - } - } + for (size_t i = 0; i < n_inputs; ++i) { + ONNXTensorElementDataType elem_type = + (ONNXTensorElementDataType)values[i].elem_type(); + ThrowOnError(GetOrtApi()->CreateTensorWithDataAsOrtValue( + cpu_memory_info_, values[i].data(), + values[i].size() * ElementSize(elem_type), shapes[i].dims(), + shapes[i].ndim(), elem_type, &ort_values[i])); } - size_t Run(size_t n_inputs, - OrtShape* shapes, - OrtCpuValue* values, - size_t max_outputs, - OrtShape* out_shapes, - OrtCpuValue* out_values) { - if (max_outputs < n_outputs_) - 
EXT_THROW("Not enough expected outputs, max_outputs=", - max_outputs, " > ", n_outputs_, "."); - if (n_inputs > n_inputs_) - EXT_THROW("Too many inputs, n_inputs=", n_inputs, " > ", n_inputs, "."); - std::vector ort_values(n_inputs); - - for(size_t i = 0; i < n_inputs; ++i) { - ONNXTensorElementDataType elem_type = (ONNXTensorElementDataType)values[i].elem_type(); - ThrowOnError(GetOrtApi()->CreateTensorWithDataAsOrtValue( - cpu_memory_info_, values[i].data(), - values[i].size() * ElementSize(elem_type), - shapes[i].dims(), shapes[i].ndim(), - elem_type, &ort_values[i])); - } + std::vector ort_values_out(n_outputs_); + ThrowOnError(GetOrtApi()->Run(sess_, run_options_, input_names_call_.data(), + ort_values.data(), n_inputs, + output_names_call_.data(), n_outputs_, + ort_values_out.data())); - std::vector ort_values_out(n_outputs_); - ThrowOnError(GetOrtApi()->Run( - sess_, run_options_, - input_names_call_.data(), ort_values.data(), n_inputs, - output_names_call_.data(), n_outputs_, ort_values_out.data())); - - for(size_t i = 0; i < n_inputs; ++i) { - GetOrtApi()->ReleaseValue(ort_values[i]); - } - OrtTensorTypeAndShapeInfo* info; - ONNXTensorElementDataType elem_type; - size_t size, n_dims; - void* data; - for(size_t i = 0; i < n_outputs_; ++i) { - ThrowOnError(GetOrtApi()->GetTensorTypeAndShape(ort_values_out[i], &info)); - ThrowOnError(GetOrtApi()->GetTensorElementType(info, &elem_type)); - if (elem_type == ONNXTensorElementDataType::ONNX_TENSOR_ELEMENT_DATA_TYPE_STRING) { - GetOrtApi()->ReleaseTensorTypeAndShapeInfo(info); - for(; i < n_outputs_; ++i) { - GetOrtApi()->ReleaseValue(ort_values_out[i]); - } - throw std::runtime_error("tensor(string) is not supported as outputs."); - } - ThrowOnError(GetOrtApi()->GetTensorShapeElementCount(info, &size)); - ThrowOnError(GetOrtApi()->GetTensorMutableData(ort_values_out[i], &data)); - ThrowOnError(GetOrtApi()->GetDimensionsCount(info, &n_dims)); - out_shapes[i].init(n_dims); - 
ThrowOnError(GetOrtApi()->GetDimensions(info, (int64_t*)out_shapes[i].dims(), n_dims)); - /* typedef void copy_allocate(size_t output, int elem_type, size_t size, - OrtShape shape, void* data, void* args); */ - GetOrtApi()->ReleaseTensorTypeAndShapeInfo(info); - out_values[i].init(size, elem_type, data, ort_values_out[i]); - // GetOrtApi()->ReleaseValue(ort_values_out[i]); + for (size_t i = 0; i < n_inputs; ++i) { + GetOrtApi()->ReleaseValue(ort_values[i]); + } + OrtTensorTypeAndShapeInfo *info; + ONNXTensorElementDataType elem_type; + size_t size, n_dims; + void *data; + for (size_t i = 0; i < n_outputs_; ++i) { + ThrowOnError( + GetOrtApi()->GetTensorTypeAndShape(ort_values_out[i], &info)); + ThrowOnError(GetOrtApi()->GetTensorElementType(info, &elem_type)); + if (elem_type == + ONNXTensorElementDataType::ONNX_TENSOR_ELEMENT_DATA_TYPE_STRING) { + GetOrtApi()->ReleaseTensorTypeAndShapeInfo(info); + for (; i < n_outputs_; ++i) { + GetOrtApi()->ReleaseValue(ort_values_out[i]); } - return n_outputs_; + throw std::runtime_error("tensor(string) is not supported as outputs."); + } + ThrowOnError(GetOrtApi()->GetTensorShapeElementCount(info, &size)); + ThrowOnError(GetOrtApi()->GetTensorMutableData(ort_values_out[i], &data)); + ThrowOnError(GetOrtApi()->GetDimensionsCount(info, &n_dims)); + out_shapes[i].init(n_dims); + ThrowOnError(GetOrtApi()->GetDimensions( + info, (int64_t *)out_shapes[i].dims(), n_dims)); + /* typedef void copy_allocate(size_t output, int elem_type, size_t size, + OrtShape shape, void* data, void* args); */ + GetOrtApi()->ReleaseTensorTypeAndShapeInfo(info); + out_values[i].init(size, elem_type, data, ort_values_out[i]); + // GetOrtApi()->ReleaseValue(ort_values_out[i]); } + return n_outputs_; + } protected: - void LoadFinalize() { - EXT_ENFORCE(cpu_memory_info_ != nullptr); - ThrowOnError(GetOrtApi()->CreateAllocator(sess_, cpu_memory_info_ , &cpu_allocator_)); - EXT_ENFORCE(cpu_allocator_ != nullptr); - 
ThrowOnError(GetOrtApi()->SessionGetInputCount(sess_, &n_inputs_)); - ThrowOnError(GetOrtApi()->SessionGetOutputCount(sess_, &n_outputs_)); - input_names_.reserve(n_inputs_); - output_names_.reserve(n_outputs_); - - char* name; - for(size_t i = 0; i < n_inputs_; ++i) { - ThrowOnError(GetOrtApi()->SessionGetInputName(sess_, i, cpu_allocator_, &name)); - input_names_.emplace_back(std::string(name)); - ThrowOnError(GetOrtApi()->AllocatorFree(cpu_allocator_, name)); - } - for(size_t i = 0; i < n_outputs_; ++i) { - ThrowOnError(GetOrtApi()->SessionGetOutputName(sess_, i, cpu_allocator_, &name)); - output_names_.emplace_back(std::string(name)); - ThrowOnError(GetOrtApi()->AllocatorFree(cpu_allocator_, name)); - } - input_names_call_.resize(n_inputs_); - for(size_t i = 0; i < n_inputs_; ++i) { - input_names_call_[i] = input_names_[i].c_str(); - } - output_names_call_.resize(n_inputs_); - for(size_t i = 0; i < n_inputs_; ++i) { - output_names_call_[i] = output_names_[i].c_str(); - } + void LoadFinalize() { + EXT_ENFORCE(cpu_memory_info_ != nullptr); + ThrowOnError( + GetOrtApi()->CreateAllocator(sess_, cpu_memory_info_, &cpu_allocator_)); + EXT_ENFORCE(cpu_allocator_ != nullptr); + ThrowOnError(GetOrtApi()->SessionGetInputCount(sess_, &n_inputs_)); + ThrowOnError(GetOrtApi()->SessionGetOutputCount(sess_, &n_outputs_)); + input_names_.reserve(n_inputs_); + output_names_.reserve(n_outputs_); + + char *name; + for (size_t i = 0; i < n_inputs_; ++i) { + ThrowOnError( + GetOrtApi()->SessionGetInputName(sess_, i, cpu_allocator_, &name)); + input_names_.emplace_back(std::string(name)); + ThrowOnError(GetOrtApi()->AllocatorFree(cpu_allocator_, name)); + } + for (size_t i = 0; i < n_outputs_; ++i) { + ThrowOnError( + GetOrtApi()->SessionGetOutputName(sess_, i, cpu_allocator_, &name)); + output_names_.emplace_back(std::string(name)); + ThrowOnError(GetOrtApi()->AllocatorFree(cpu_allocator_, name)); + } + input_names_call_.resize(n_inputs_); + for (size_t i = 0; i < n_inputs_; ++i) { 
+ input_names_call_[i] = input_names_[i].c_str(); } + output_names_call_.resize(n_inputs_); + for (size_t i = 0; i < n_inputs_; ++i) { + output_names_call_[i] = output_names_[i].c_str(); + } + } private: - // before loading the model - OrtEnv* env_; - OrtSessionOptions* sess_options_; - OrtRunOptions* run_options_; - OrtMemoryInfo* cpu_memory_info_; + // before loading the model + OrtEnv *env_; + OrtSessionOptions *sess_options_; + OrtRunOptions *run_options_; + OrtMemoryInfo *cpu_memory_info_; private: - // after loading the model - OrtSession* sess_; - OrtAllocator* cpu_allocator_; - size_t n_inputs_; - size_t n_outputs_; - std::vector input_names_; - std::vector output_names_; - std::vector input_names_call_; - std::vector output_names_call_; + // after loading the model + OrtSession *sess_; + OrtAllocator *cpu_allocator_; + size_t n_inputs_; + size_t n_outputs_; + std::vector input_names_; + std::vector output_names_; + std::vector input_names_call_; + std::vector output_names_call_; }; /* @@ -253,47 +264,44 @@ typedef enum { //////// SIMPLE API ////// -OrtSessionType* create_session() { return (OrtSessionType*)(new OrtInference()); } -void delete_session(OrtSessionType* ptr) { - if (ptr == nullptr) - throw std::runtime_error("Cannot delete a null pointer (delete_session)."); - delete (OrtInference*)ptr; +OrtSessionType *create_session() { + return (OrtSessionType *)(new OrtInference()); +} +void delete_session(OrtSessionType *ptr) { + if (ptr == nullptr) + throw std::runtime_error("Cannot delete a null pointer (delete_session)."); + delete (OrtInference *)ptr; +} +void session_load_from_file(OrtSessionType *ptr, const char *filename) { + ((OrtInference *)ptr)->LoadFromFile(filename); +} +void session_load_from_bytes(OrtSessionType *ptr, const void *buffer, + size_t size) { + ((OrtInference *)ptr)->LoadFromBytes(buffer, size); +} +size_t session_get_input_count(OrtSessionType *ptr) { + return ((OrtInference *)ptr)->GetInputCount(); } -void 
session_load_from_file(OrtSessionType* ptr, const char* filename) { ((OrtInference*)ptr)->LoadFromFile(filename); } -void session_load_from_bytes(OrtSessionType* ptr, const void* buffer, size_t size) { - ((OrtInference*)ptr)->LoadFromBytes(buffer, size); +size_t session_get_output_count(OrtSessionType *ptr) { + return ((OrtInference *)ptr)->GetOutputCount(); } -size_t session_get_input_count(OrtSessionType* ptr) { return ((OrtInference*)ptr)->GetInputCount(); } -size_t session_get_output_count(OrtSessionType* ptr) { return ((OrtInference*)ptr)->GetOutputCount(); } -void session_initialize(OrtSessionType* ptr, - const char* optimized_file_path, - int graph_optimization_level, - int enable_cuda, - int cuda_device_id, - int set_denormal_as_zero, - int intra_op_num_threads, - int inter_op_num_threads, - char** custom_libs) { - ((OrtInference*)ptr)->Initialize(optimized_file_path, - graph_optimization_level, - enable_cuda, - cuda_device_id, - set_denormal_as_zero, - intra_op_num_threads, - inter_op_num_threads, - (const char**)custom_libs); +void session_initialize(OrtSessionType *ptr, const char *optimized_file_path, + int graph_optimization_level, int enable_cuda, + int cuda_device_id, int set_denormal_as_zero, + int intra_op_num_threads, int inter_op_num_threads, + char **custom_libs) { + ((OrtInference *)ptr) + ->Initialize(optimized_file_path, graph_optimization_level, enable_cuda, + cuda_device_id, set_denormal_as_zero, intra_op_num_threads, + inter_op_num_threads, (const char **)custom_libs); } -size_t session_run(OrtSessionType* ptr, - size_t n_inputs, - OrtShape* shapes, - OrtCpuValue* values, - size_t max_outputs, - OrtShape* out_shapes, - OrtCpuValue* out_values) { - return ((OrtInference*)ptr)->Run(n_inputs, shapes, values, - max_outputs, out_shapes, out_values); +size_t session_run(OrtSessionType *ptr, size_t n_inputs, OrtShape *shapes, + OrtCpuValue *values, size_t max_outputs, + OrtShape *out_shapes, OrtCpuValue *out_values) { + return ((OrtInference 
*)ptr) + ->Run(n_inputs, shapes, values, max_outputs, out_shapes, out_values); } } // namespace ortapi diff --git a/onnx_extended/ortcy/wrap/ortapi.h b/onnx_extended/ortcy/wrap/ortapi.h index 718feea2..929b9daf 100644 --- a/onnx_extended/ortcy/wrap/ortapi.h +++ b/onnx_extended/ortcy/wrap/ortapi.h @@ -10,100 +10,99 @@ namespace ortapi { inline size_t ElementSize(ONNXTensorElementDataType elem_type) { - switch(elem_type) { - case ONNXTensorElementDataType::ONNX_TENSOR_ELEMENT_DATA_TYPE_DOUBLE: - case ONNXTensorElementDataType::ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT64: - case ONNXTensorElementDataType::ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64: - return 8; - case ONNXTensorElementDataType::ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT: - case ONNXTensorElementDataType::ONNX_TENSOR_ELEMENT_DATA_TYPE_INT32: - case ONNXTensorElementDataType::ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT32: - return 4; - case ONNXTensorElementDataType::ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT16: - case ONNXTensorElementDataType::ONNX_TENSOR_ELEMENT_DATA_TYPE_BFLOAT16: - case ONNXTensorElementDataType::ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT16: - case ONNXTensorElementDataType::ONNX_TENSOR_ELEMENT_DATA_TYPE_INT16: - return 2; - case ONNXTensorElementDataType::ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT8: - case ONNXTensorElementDataType::ONNX_TENSOR_ELEMENT_DATA_TYPE_INT8: - return 2; - default: - throw std::runtime_error( - "One element type is not implemented in function `ortapi::ElementSize()`."); - } + switch (elem_type) { + case ONNXTensorElementDataType::ONNX_TENSOR_ELEMENT_DATA_TYPE_DOUBLE: + case ONNXTensorElementDataType::ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT64: + case ONNXTensorElementDataType::ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64: + return 8; + case ONNXTensorElementDataType::ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT: + case ONNXTensorElementDataType::ONNX_TENSOR_ELEMENT_DATA_TYPE_INT32: + case ONNXTensorElementDataType::ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT32: + return 4; + case 
ONNXTensorElementDataType::ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT16: + case ONNXTensorElementDataType::ONNX_TENSOR_ELEMENT_DATA_TYPE_BFLOAT16: + case ONNXTensorElementDataType::ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT16: + case ONNXTensorElementDataType::ONNX_TENSOR_ELEMENT_DATA_TYPE_INT16: + return 2; + case ONNXTensorElementDataType::ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT8: + case ONNXTensorElementDataType::ONNX_TENSOR_ELEMENT_DATA_TYPE_INT8: + return 2; + default: + throw std::runtime_error("One element type is not implemented in function " + "`ortapi::ElementSize()`."); + } } inline size_t ElementSizeI(int elem_type) { - return ElementSize((ONNXTensorElementDataType)elem_type); + return ElementSize((ONNXTensorElementDataType)elem_type); } class OrtShape { private: - int64_t size_; - int64_t dims_[8]; + int64_t size_; + int64_t dims_[8]; public: - inline OrtShape() { size_ = 0; } - inline OrtShape(size_t ndim) { init(ndim); } - inline void init(size_t ndim) { - if (ndim > 8) - throw std::runtime_error("shape cannot have more than 8 dimensions."); - size_ = ndim; - } - inline int64_t ndim() const { return size_; } - inline void set(size_t i, int64_t dim) { dims_[i] = dim; } - inline const int64_t *dims() const { return dims_; } + inline OrtShape() { size_ = 0; } + inline OrtShape(size_t ndim) { init(ndim); } + inline void init(size_t ndim) { + if (ndim > 8) + throw std::runtime_error("shape cannot have more than 8 dimensions."); + size_ = ndim; + } + inline int64_t ndim() const { return size_; } + inline void set(size_t i, int64_t dim) { dims_[i] = dim; } + inline const int64_t *dims() const { return dims_; } }; class OrtCpuValue { - private: - size_t size_; - int elem_type_; // ONNXTensorElementDataType - void* data_; - void* ort_value_; - public: - inline OrtCpuValue() { elem_type_ = -1; size_ = 0; ort_value_ = nullptr; data_ = nullptr; } - inline void init(size_t size, int elem_type, void* data, void* ort_value) { - size_ = size; - elem_type_ = elem_type; - data_ = data; - 
ort_value_ = ort_value; - } - inline size_t size() { return size_; } - inline int elem_type() { return elem_type_; } - inline void* data() { return data_; } - void free_ort_value(); +private: + size_t size_; + int elem_type_; // ONNXTensorElementDataType + void *data_; + void *ort_value_; + +public: + inline OrtCpuValue() { + elem_type_ = -1; + size_ = 0; + ort_value_ = nullptr; + data_ = nullptr; + } + inline void init(size_t size, int elem_type, void *data, void *ort_value) { + size_ = size; + elem_type_ = elem_type; + data_ = data; + ort_value_ = ort_value; + } + inline size_t size() { return size_; } + inline int elem_type() { return elem_type_; } + inline void *data() { return data_; } + void free_ort_value(); }; // Simplified API for this project. // see https://onnxruntime.ai/docs/api/c/ -typedef void release(size_t output, int elem_type, size_t size, OrtShape* shape, void* data, void* args); +typedef void release(size_t output, int elem_type, size_t size, OrtShape *shape, + void *data, void *args); std::vector get_available_providers(); OrtSessionType *create_session(); void delete_session(OrtSessionType *); -void session_load_from_file(OrtSessionType*, const char* filename); -void session_load_from_bytes(OrtSessionType*, const void* buffer, size_t size); -void session_initialize(OrtSessionType* ptr, - const char* optimized_file_path, - int graph_optimization_level = -1, - int enable_cuda = 0, - int cuda_device_id = 0, - int set_denormal_as_zero = 0, +void session_load_from_file(OrtSessionType *, const char *filename); +void session_load_from_bytes(OrtSessionType *, const void *buffer, size_t size); +void session_initialize(OrtSessionType *ptr, const char *optimized_file_path, + int graph_optimization_level = -1, int enable_cuda = 0, + int cuda_device_id = 0, int set_denormal_as_zero = 0, int intra_op_num_threads = -1, int inter_op_num_threads = -1, - char** custom_libs = nullptr); + char **custom_libs = nullptr); size_t 
session_get_input_count(OrtSessionType *); size_t session_get_output_count(OrtSessionType *); -size_t session_run(OrtSessionType* ptr, - size_t n_inputs, - OrtShape* shapes, - OrtCpuValue* values, - size_t max_outputs, - OrtShape* out_shapes, - OrtCpuValue* out_values); - +size_t session_run(OrtSessionType *ptr, size_t n_inputs, OrtShape *shapes, + OrtCpuValue *values, size_t max_outputs, + OrtShape *out_shapes, OrtCpuValue *out_values); } // namespace ortapi diff --git a/onnx_extended/ortcy/wrap/ortapi_inline.h b/onnx_extended/ortcy/wrap/ortapi_inline.h index 188a66ae..33813492 100644 --- a/onnx_extended/ortcy/wrap/ortapi_inline.h +++ b/onnx_extended/ortcy/wrap/ortapi_inline.h @@ -1,27 +1,29 @@ #pragma once -#include "helpers.h" - #define ORT_API_MANUAL_INIT #include "onnxruntime_c_api.h" #undef ORT_API_MANUAL_INIT +#include "onnx_extended_helpers.h" + namespace ortapi { -inline static const OrtApi *GetOrtApi() { - const OrtApi* api_ = OrtGetApiBase()->GetApi(ORT_API_VERSION); - return api_; +inline static const OrtApi *GetOrtApi() { + const OrtApi *api_ = OrtGetApiBase()->GetApi(ORT_API_VERSION); + return api_; } -inline const char* ort_version() { return OrtGetApiBase()->GetVersionString(); } - -inline void _ThrowOnError_(OrtStatus* ort_status, const char* filename, int line) { - if (ort_status) { - std::string message(GetOrtApi()->GetErrorMessage(ort_status)); - OrtErrorCode code = GetOrtApi()->GetErrorCode(ort_status); - throw std::runtime_error( - orthelpers::MakeString("error: onnxruntime(", code, "), ", message, "\n ", filename, ":", line)); - } +inline const char *ort_version() { return OrtGetApiBase()->GetVersionString(); } + +inline void _ThrowOnError_(OrtStatus *ort_status, const char *filename, + int line) { + if (ort_status) { + std::string message(GetOrtApi()->GetErrorMessage(ort_status)); + OrtErrorCode code = GetOrtApi()->GetErrorCode(ort_status); + throw std::runtime_error(onnx_extended_helpers::MakeString( + "error: onnxruntime(", code, "), ", 
message, "\n ", filename, ":", + line)); + } } #define ThrowOnError(ort_status) _ThrowOnError_(ort_status, __FILE__, __LINE__) diff --git a/onnx_extended/ortops/common/common_kernels.h b/onnx_extended/ortops/common/common_kernels.h index 8e326fe2..f5fec662 100644 --- a/onnx_extended/ortops/common/common_kernels.h +++ b/onnx_extended/ortops/common/common_kernels.h @@ -1,75 +1,17 @@ #pragma once -#include "helpers.h" #define ORT_API_MANUAL_INIT #include #include #undef ORT_API_MANUAL_INIT -namespace ortops { - -inline void MakeStringInternal(std::ostringstream &ss) noexcept {} - -template -inline void MakeStringInternal(std::ostringstream &ss, const T &t) noexcept { - ss << t; -} - -template <> -inline void MakeStringInternal(std::ostringstream &ss, - const std::vector &t) noexcept { - for (auto it : t) - ss << "x" << it; -} - -template <> -inline void MakeStringInternal(std::ostringstream &ss, - const std::vector &t) noexcept { - for (auto it : t) - ss << "x" << it; -} - -template <> -inline void MakeStringInternal(std::ostringstream &ss, - const std::vector &t) noexcept { - for (auto it : t) - ss << "x" << it; -} +#include "onnx_extended_helpers.h" -template <> -inline void MakeStringInternal(std::ostringstream &ss, - const std::vector &t) noexcept { - for (auto it : t) - ss << "x" << it; -} - -template <> -inline void MakeStringInternal(std::ostringstream &ss, - const std::vector &t) noexcept { - for (auto it : t) - ss << "x" << it; -} - -template <> -inline void MakeStringInternal(std::ostringstream &ss, - const std::vector &t) noexcept { - for (auto it : t) - ss << "x" << it; -} - -template -inline void MakeStringInternal(std::ostringstream &ss, const T &t, - const Args &...args) noexcept { - MakeStringInternal(ss, t); - MakeStringInternal(ss, args...); -} - -template inline std::string MakeString(const Args &...args) { - std::ostringstream ss; - MakeStringInternal(ss, args...); - return std::string(ss.str()); -} +namespace ortops { +//////////////////////// +// 
errors and exceptions +//////////////////////// inline void _ThrowOnError_(OrtStatus *ort_status, const char *filename, int line, const OrtApi &api) { @@ -81,9 +23,9 @@ inline void _ThrowOnError_(OrtStatus *ort_status, const char *filename, std::string message(api.GetErrorMessage(ort_status)); api.ReleaseStatus(ort_status); if (code != ORT_OK) { - throw std::runtime_error( - orthelpers::MakeString("error: onnxruntime(", code, "), ", message, - "\n ", filename, ":", line)); + throw std::runtime_error(onnx_extended_helpers::MakeString( + "error: onnxruntime(", code, "), ", message, "\n ", filename, + ":", line)); } } } @@ -92,6 +34,10 @@ inline void _ThrowOnError_(OrtStatus *ort_status, const char *filename, #define ThrowOnError(api, ort_status) \ _ThrowOnError_(ort_status, __FILE__, __LINE__, api) +//////////////////// +// kernel attributes +//////////////////// + inline std::string KernelInfoGetOptionalAttributeString( const OrtApi &api, const OrtKernelInfo *info, const char *name, const std::string &default_value) { @@ -137,6 +83,51 @@ KernelInfoGetAttributeApi(const OrtApi &api, const OrtKernelInfo *info, return api.KernelInfoGetAttribute_float(info, name, &out); } +template <> +inline OrtStatus *KernelInfoGetAttributeApi>( + const OrtApi &api, const OrtKernelInfo *info, const char *name, + std::vector &out) { + size_t size = 0; + + // Feed nullptr for the data buffer to query the true size of the attribute + OrtStatus *status = + api.KernelInfoGetAttributeArray_float(info, name, nullptr, &size); + + if (status == nullptr) { + out.resize(size); + status = + api.KernelInfoGetAttributeArray_float(info, name, out.data(), &size); + } + + return status; +} + +template <> +inline OrtStatus *KernelInfoGetAttributeApi>( + const OrtApi &api, const OrtKernelInfo *info, const char *name, + std::vector &out) { + size_t size = 0; + + // Feed nullptr for the data buffer to query the true size of the attribute + OrtStatus *status = + api.KernelInfoGetAttributeArray_int64(info, 
name, nullptr, &size); + + if (status == nullptr) { + out.resize(size); + ThrowOnError(api, api.KernelInfoGetAttributeArray_int64(info, name, + out.data(), &size)); + } + return status; +} + +template <> +inline OrtStatus *KernelInfoGetAttributeApi>( + const OrtApi &api, const OrtKernelInfo *info, const char *name, + std::vector &output) { + EXT_THROW("Unable to retrieve attribute as an array of strings. " + "You should use a single comma separated string."); +} + template inline T KernelInfoGetOptionalAttribute(const OrtApi &api, const OrtKernelInfo *info, @@ -144,9 +135,8 @@ inline T KernelInfoGetOptionalAttribute(const OrtApi &api, T out; OrtStatus *status = KernelInfoGetAttributeApi(api, info, name, out); - if (status == nullptr) { + if (status == nullptr) return out; - } OrtErrorCode code = api.GetErrorCode(status); if (code == ORT_FAIL) { api.ReleaseStatus(status); diff --git a/onnx_extended/ortops/common/common_kernels_cuda.h b/onnx_extended/ortops/common/common_kernels_cuda.h index 45c05717..51b19ca9 100644 --- a/onnx_extended/ortops/common/common_kernels_cuda.h +++ b/onnx_extended/ortops/common/common_kernels_cuda.h @@ -2,7 +2,7 @@ #include "common_kernels.h" #include "cublas_v2.h" -#include "helpers.h" +#include "onnx_extended_helpers.h" #include namespace ortops { diff --git a/onnx_extended/ortops/optim/__init__.py b/onnx_extended/ortops/optim/__init__.py new file mode 100644 index 00000000..8b137891 --- /dev/null +++ b/onnx_extended/ortops/optim/__init__.py @@ -0,0 +1 @@ + diff --git a/onnx_extended/ortops/optim/cpu/__init__.py b/onnx_extended/ortops/optim/cpu/__init__.py new file mode 100644 index 00000000..77cfc65e --- /dev/null +++ b/onnx_extended/ortops/optim/cpu/__init__.py @@ -0,0 +1,53 @@ +import os +import textwrap +from typing import List +from ... 
import _get_ort_ext_libs + + +def get_ort_ext_libs() -> List[str]: + """ + Returns the list of libraries implementing new simple + :epkg:`onnxruntime` kernels implemented for the + :epkg:`CPUExecutionProvider`. + """ + return _get_ort_ext_libs(os.path.dirname(__file__)) + + +def documentation() -> List[str]: + """ + Returns a list of rst strings documenting every implemented kernel + in this subfolder. + """ + return list( + map( + textwrap.dedent, + [ + """ + onnx_extented.ortops.optim.cpu.TreeEnsembleRegressor + ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + Optimized implementation of the TreeEnsembleRegressor operator. + + **Provider** + + CPUExecutionProvider + + **Inputs** + + * X (T1): tensor of type T1 + + **Outputs** + + * Y (T2): prediction of type T2 + + **Constraints** + + * T1: float, double + * T2: float, double + + **Attributes** + + """, + ], + ) + ) diff --git a/onnx_extended/ortops/optim/cpu/ort_optim_cpu_lib.cc b/onnx_extended/ortops/optim/cpu/ort_optim_cpu_lib.cc new file mode 100644 index 00000000..0cbf20c5 --- /dev/null +++ b/onnx_extended/ortops/optim/cpu/ort_optim_cpu_lib.cc @@ -0,0 +1,42 @@ +// Source: https://github.com/microsoft/onnxruntime/tree/main/ +// onnxruntime/test/testdata/custom_op_get_const_input_test_library + +#include +#include + +#include "ort_optim_cpu_lib.h" +#include "tree_ensemble.h" + +static const char *c_OpDomain = "onnx_extented.ortops.optim.cpu"; + +static void AddOrtCustomOpDomainToContainer(Ort::CustomOpDomain &&domain) { + static std::vector ort_custom_op_domain_container; + static std::mutex ort_custom_op_domain_mutex; + std::lock_guard lock(ort_custom_op_domain_mutex); + ort_custom_op_domain_container.push_back(std::move(domain)); +} + +OrtStatus *ORT_API_CALL RegisterCustomOps(OrtSessionOptions *options, + const OrtApiBase *api_base) { + Ort::InitApi(api_base->GetApi(ORT_API_VERSION)); + Ort::UnownedSessionOptions session_options(options); + + // An instance remaining available until onnxruntime unloads the library.
+ static ortops::TreeEnsembleRegressor c_TreeEnsembleRegressor; + + OrtStatus *result = nullptr; + + try { + Ort::CustomOpDomain domain{c_OpDomain}; + + domain.Add(&c_TreeEnsembleRegressor); + + session_options.Add(domain); + AddOrtCustomOpDomainToContainer(std::move(domain)); + } catch (const std::exception &e) { + Ort::Status status{e}; + result = status.release(); + } + + return result; +} diff --git a/onnx_extended/ortops/optim/cpu/ort_optim_cpu_lib.h b/onnx_extended/ortops/optim/cpu/ort_optim_cpu_lib.h new file mode 100644 index 00000000..b7f1241e --- /dev/null +++ b/onnx_extended/ortops/optim/cpu/ort_optim_cpu_lib.h @@ -0,0 +1,14 @@ +#pragma once + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +ORT_EXPORT OrtStatus *ORT_API_CALL +RegisterCustomOps(OrtSessionOptions *options, const OrtApiBase *api_base); + +#ifdef __cplusplus +} +#endif diff --git a/onnx_extended/ortops/optim/cpu/tree_ensemble.cc b/onnx_extended/ortops/optim/cpu/tree_ensemble.cc new file mode 100644 index 00000000..50c1e77e --- /dev/null +++ b/onnx_extended/ortops/optim/cpu/tree_ensemble.cc @@ -0,0 +1,145 @@ +#include "tree_ensemble.h" + +namespace ortops { + +//////////////////////// +// Operators declaration +//////////////////////// + +void *TreeEnsembleRegressor::CreateKernel(const OrtApi &api, + const OrtKernelInfo *info) const { + return std::make_unique(api, info).release(); +}; + +const char *TreeEnsembleRegressor::GetName() const { + return "TreeEnsembleRegressor"; +}; + +const char *TreeEnsembleRegressor::GetExecutionProviderType() const { + return "CPUExecutionProvider"; +}; + +size_t TreeEnsembleRegressor::GetInputTypeCount() const { return 1; }; + +ONNXTensorElementDataType +TreeEnsembleRegressor::GetInputType(size_t index) const { + return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT; +}; + +size_t TreeEnsembleRegressor::GetOutputTypeCount() const { return 1; }; + +ONNXTensorElementDataType +TreeEnsembleRegressor::GetOutputType(size_t index) const { + return 
ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT; +}; + +//////////////////////// +// Kernel initialization +//////////////////////// + +TreeEnsembleKernel::TreeEnsembleKernel(const OrtApi &api, + const OrtKernelInfo *info) { + reg_float_float_float = nullptr; + + std::string aggregate_function = KernelInfoGetOptionalAttributeString( + api, info, "aggregate_function", "SUM"); + std::vector base_values = KernelInfoGetOptionalAttribute( + api, info, "base_values", std::vector()); + n_targets_or_classes = KernelInfoGetOptionalAttribute( + api, info, "n_targets", static_cast(1)); + std::vector nodes_falsenodeids = KernelInfoGetOptionalAttribute( + api, info, "nodes_falsenodeids", std::vector()); + std::vector nodes_featureids = KernelInfoGetOptionalAttribute( + api, info, "nodes_featureids", std::vector()); + std::vector nodes_hitrates = KernelInfoGetOptionalAttribute( + api, info, "nodes_hitrates", std::vector()); + std::vector nodes_missing_value_tracks_true = + KernelInfoGetOptionalAttribute( + api, info, "nodes_missing_value_tracks_true", std::vector()); + std::string nodes_modes_single = + KernelInfoGetOptionalAttributeString(api, info, "nodes_modes", ""); + std::vector nodes_nodeids = KernelInfoGetOptionalAttribute( + api, info, "nodes_nodeids", std::vector()); + std::vector nodes_treeids = KernelInfoGetOptionalAttribute( + api, info, "nodes_treeids", std::vector()); + std::vector nodes_truenodeids = KernelInfoGetOptionalAttribute( + api, info, "nodes_truenodeids", std::vector()); + std::vector nodes_values = KernelInfoGetOptionalAttribute( + api, info, "nodes_values", std::vector()); + std::string post_transform = + KernelInfoGetOptionalAttributeString(api, info, "post_transform", "NONE"); + + std::vector target_class_ids = KernelInfoGetOptionalAttribute( + api, info, "target_ids", std::vector()); + std::vector target_class_nodeids = KernelInfoGetOptionalAttribute( + api, info, "target_nodeids", std::vector()); + std::vector target_class_treeids = 
KernelInfoGetOptionalAttribute( + api, info, "target_treeids", std::vector()); + std::vector target_class_weights = KernelInfoGetOptionalAttribute( + api, info, "target_weights", std::vector()); + + std::vector nodes_modes = SplitString(nodes_modes_single, ','); + EXT_ENFORCE(n_targets_or_classes > 0); + EXT_ENFORCE(nodes_values.size() > 0); + EXT_ENFORCE(nodes_nodeids.size() > 0); + EXT_ENFORCE(nodes_modes.size() == nodes_falsenodeids.size(), + " nodes_modes.size()==", nodes_modes.size(), + "!=", nodes_falsenodeids.size(), + ", nodes_modes=", nodes_modes_single, "."); + EXT_ENFORCE(n_targets_or_classes > 0); + + std::unique_ptr> ptr( + new onnx_c_ops::TreeEnsembleCommon()); + reg_float_float_float.swap(ptr); + auto status = reg_float_float_float->Init( + aggregate_function, base_values, n_targets_or_classes, nodes_falsenodeids, + nodes_featureids, nodes_hitrates, nodes_missing_value_tracks_true, + nodes_modes, nodes_nodeids, nodes_treeids, nodes_truenodeids, + nodes_values, post_transform, target_class_ids, target_class_nodeids, + target_class_treeids, target_class_weights); + EXT_ENFORCE(status.IsOK(), "The tree ensemble initialisation failed."); + + int64_t parallel_tree = KernelInfoGetOptionalAttribute( + api, info, "parallel_tree", static_cast(80)); + int64_t parallel_tree_N = KernelInfoGetOptionalAttribute( + api, info, "parallel_tree_N", static_cast(128)); + int64_t parallel_N = KernelInfoGetOptionalAttribute(api, info, "parallel_N", + static_cast(50)); + int64_t batch_size_tree = KernelInfoGetOptionalAttribute( + api, info, "batch_size_tree", static_cast(2)); + int64_t batch_size_rows = KernelInfoGetOptionalAttribute( + api, info, "batch_size_rows", static_cast(2)); + int64_t use_node3 = KernelInfoGetOptionalAttribute(api, info, "use_node3", + static_cast(0)); + + reg_float_float_float->set(parallel_tree, parallel_tree_N, parallel_N, + batch_size_tree, batch_size_rows, use_node3); +} + +//////////////////////// +// Kernel Implementation 
+//////////////////////// + +void TreeEnsembleKernel::Compute(OrtKernelContext *context) { + Ort::KernelContext ctx(context); + Ort::ConstValue input_X = ctx.GetInput(0); + std::vector dimensions_in = + input_X.GetTensorTypeAndShapeInfo().GetShape(); + EXT_ENFORCE(dimensions_in.size() == 2, "TreeEnsemble only allows 2D inputs."); + std::vector dimensions_out{dimensions_in[0], n_targets_or_classes}; + Ort::UnownedValue output = ctx.GetOutput(0, dimensions_out); + + if (reg_float_float_float.get() != nullptr) { + const float *X = input_X.GetTensorData(); + float *out = output.GetTensorMutableData(); + reg_float_float_float->Compute(dimensions_in[0], dimensions_in[1], X, out, + nullptr); + } else { + EXT_ENFORCE("No implementation yet for input type=", + input_X.GetTensorTypeAndShapeInfo().GetElementType(), + " and output type=", + output.GetTensorTypeAndShapeInfo().GetElementType(), "."); + } +} + +} // namespace ortops diff --git a/onnx_extended/ortops/optim/cpu/tree_ensemble.h b/onnx_extended/ortops/optim/cpu/tree_ensemble.h new file mode 100644 index 00000000..f04362de --- /dev/null +++ b/onnx_extended/ortops/optim/cpu/tree_ensemble.h @@ -0,0 +1,30 @@ +#pragma once + +#include "common/common_kernels.h" +#include "cpu/c_op_tree_ensemble_common_.hpp" +// #include + +namespace ortops { + +struct TreeEnsembleKernel { + TreeEnsembleKernel(const OrtApi &api, const OrtKernelInfo *info); + void Compute(OrtKernelContext *context); + + // Attributes + int64_t n_targets_or_classes; + std::unique_ptr> + reg_float_float_float; +}; + +struct TreeEnsembleRegressor + : Ort::CustomOpBase { + void *CreateKernel(const OrtApi &api, const OrtKernelInfo *info) const; + const char *GetName() const; + const char *GetExecutionProviderType() const; + size_t GetInputTypeCount() const; + ONNXTensorElementDataType GetInputType(size_t index) const; + size_t GetOutputTypeCount() const; + ONNXTensorElementDataType GetOutputType(size_t index) const; +}; + +} // namespace ortops diff --git 
a/onnx_extended/ortops/optim/optimize.py b/onnx_extended/ortops/optim/optimize.py new file mode 100644 index 00000000..7e5d8869 --- /dev/null +++ b/onnx_extended/ortops/optim/optimize.py @@ -0,0 +1,151 @@ +from typing import Any, Dict, Optional, Union +from onnx import AttributeProto, ModelProto, NodeProto, GraphProto, FunctionProto +from onnx.helper import make_model, make_node, make_graph, make_opsetid + + +def has_subgraph(node: NodeProto) -> bool: + """ + Tells if a node has a subgraph as an attribute. + """ + for att in node.attribute: + if att.type == AttributeProto.GRAPH: + return True + return False + + +def get_node_attribute(node: NodeProto, name: str) -> AttributeProto: + """ + Returns the value of one attribute. + + :param node: node + :param name: attribute name + :return: value + """ + for att in node.attribute: + if att.name == name: + return att + raise KeyError( + f"Unable to find {name!r} among {list(att.name for att in node.attribute)}." + ) + + +def change_onnx_operator_domain( + onx: Union[ModelProto, GraphProto, FunctionProto], + op_type: str, + op_domain: str = "", + new_op_type: Optional[str] = None, + new_op_domain: Optional[str] = None, + new_opset: Optional[int] = None, + **kwargs: Dict[str, Any], +) -> Union[ModelProto, GraphProto, FunctionProto]: + """ + Replaces an operator by another one in the same domain + or another one. + + :param onx: proto to modify + :param op_type: operator to look for + :param op_domain: domain to look for + :param new_op_type: new operator name or None for the same name + :param new_op_domain: new domain name or None for the same domain + :param new_opset: new opset for the new domain + :param kwargs: modified parameters, set it to None to remove them + :return: same type as the input + + The function is not recursive yet.
+ """ + + def change_node(node): + atts = [] + new_kwargs = {} + for att in node.attribute: + if att.name in kwargs: + v = kwargs[att.name] + if v is None: + continue + new_kwargs[att.name] = v + continue + atts.append(att) + for k, v in kwargs.items(): + if v is None or k in new_kwargs: + continue + new_kwargs[k] = v + new_node = make_node( + new_op_type or node.op_type, + node.input, + node.output, + domain=new_op_domain or node.domain, + **new_kwargs, + ) + if len(atts) > 0: + new_node.attribute.extend(atts) + return new_node + + if isinstance(onx, GraphProto): + new_nodes = [] + modified = False + for node in onx.node: + if has_subgraph(node): + raise NotImplementedError( + f"The function is not recursive yet and cannot " + f"handle node {node.op_type!r} from domain " + f"{node.domain!r}." + ) + if node.op_type == op_type and node.domain == op_domain: + new_node = change_node(node) + new_nodes.append(new_node) + modified = True + continue + new_nodes.append(node) + if not modified: + return onx + return make_graph( + new_nodes, + onx.name, + onx.input, + onx.output, + onx.initializer, + onx.sparse_initializer, + ) + + if isinstance(onx, FunctionProto): + raise NotImplementedError() + + if not isinstance(onx, ModelProto): + raise TypeError(f"Unexpected type for onx {type(onx)}.") + + new_graph = change_onnx_operator_domain( + onx.graph, + op_type=op_type, + op_domain=op_domain, + new_opset=new_opset, + new_op_type=new_op_type, + new_op_domain=new_op_domain, + **kwargs, + ) + if id(new_graph) == id(onx.graph): + # no change + return onx + + if new_op_domain is None: + new_op_domain = op_domain + if new_op_domain == op_domain and new_opset is not None: + raise ValueError( + f"If new_op_domain==domain=={new_op_domain!r}, " + f"new_opset must be None not {new_opset}." 
+ ) + opsets = list(onx.opset_import) + if new_op_domain != op_domain: + opsets.append(make_opsetid(new_op_domain, new_opset or 1)) + + new_model = make_model( + new_graph, + functions=onx.functions, + ir_version=onx.ir_version, + producer_name=onx.producer_name, + producer_version=onx.producer_version, + model_version=onx.model_version, + doc_string=onx.doc_string, + opset_imports=opsets, + domain=onx.domain, + ) + return new_model diff --git a/onnx_extended/ortops/tutorial/cpu/my_kernel.cc b/onnx_extended/ortops/tutorial/cpu/my_kernel.cc index 9832e46d..6ef7ef32 100644 --- a/onnx_extended/ortops/tutorial/cpu/my_kernel.cc +++ b/onnx_extended/ortops/tutorial/cpu/my_kernel.cc @@ -12,7 +12,8 @@ void MyCustomKernel::Compute(OrtKernelContext *context) { const float *Y = input_Y.GetTensorData(); // Setup output, which is assumed to have the same dimensions as the inputs. - std::vector dimensions = input_X.GetTensorTypeAndShapeInfo().GetShape(); + std::vector dimensions = + input_X.GetTensorTypeAndShapeInfo().GetShape(); Ort::UnownedValue output = ctx.GetOutput(0, dimensions); float *out = output.GetTensorMutableData(); @@ -25,20 +26,27 @@ void MyCustomKernel::Compute(OrtKernelContext *context) { } } -void* MyCustomOp::CreateKernel(const OrtApi& api, const OrtKernelInfo* info) const { +void *MyCustomOp::CreateKernel(const OrtApi &api, + const OrtKernelInfo *info) const { return std::make_unique(api, info).release(); }; -const char* MyCustomOp::GetName() const { return "MyCustomOp"; }; +const char *MyCustomOp::GetName() const { return "MyCustomOp"; }; -const char* MyCustomOp::GetExecutionProviderType() const { return "CPUExecutionProvider"; }; +const char *MyCustomOp::GetExecutionProviderType() const { + return "CPUExecutionProvider"; +}; size_t MyCustomOp::GetInputTypeCount() const { return 2; }; -ONNXTensorElementDataType MyCustomOp::GetInputType(size_t index) const { return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT; }; +ONNXTensorElementDataType 
MyCustomOp::GetInputType(size_t index) const { + return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT; +}; size_t MyCustomOp::GetOutputTypeCount() const { return 1; }; -ONNXTensorElementDataType MyCustomOp::GetOutputType(size_t index) const { return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT; }; +ONNXTensorElementDataType MyCustomOp::GetOutputType(size_t index) const { + return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT; +}; } // namespace ortops diff --git a/onnx_extended/ortops/tutorial/cpu/my_kernel.h b/onnx_extended/ortops/tutorial/cpu/my_kernel.h index fba3344d..67be0686 100644 --- a/onnx_extended/ortops/tutorial/cpu/my_kernel.h +++ b/onnx_extended/ortops/tutorial/cpu/my_kernel.h @@ -6,13 +6,13 @@ namespace ortops { struct MyCustomKernel { MyCustomKernel(const OrtApi &api, const OrtKernelInfo *info); - void Compute(OrtKernelContext* context); + void Compute(OrtKernelContext *context); }; struct MyCustomOp : Ort::CustomOpBase { - void* CreateKernel(const OrtApi& api, const OrtKernelInfo* info) const ; - const char* GetName() const; - const char* GetExecutionProviderType() const; + void *CreateKernel(const OrtApi &api, const OrtKernelInfo *info) const; + const char *GetName() const; + const char *GetExecutionProviderType() const; size_t GetInputTypeCount() const; ONNXTensorElementDataType GetInputType(size_t index) const; size_t GetOutputTypeCount() const; diff --git a/onnx_extended/ortops/tutorial/cpu/my_kernel_attr.cc b/onnx_extended/ortops/tutorial/cpu/my_kernel_attr.cc index a02fd3e8..5fcf911d 100644 --- a/onnx_extended/ortops/tutorial/cpu/my_kernel_attr.cc +++ b/onnx_extended/ortops/tutorial/cpu/my_kernel_attr.cc @@ -6,18 +6,21 @@ MyCustomKernelWithAttributes::MyCustomKernelWithAttributes( const OrtApi &api, const OrtKernelInfo *info) { // A float attribute. 
float value_float; - ThrowOnError(api, api.KernelInfoGetAttribute_float(info, "att_float", &value_float)); + ThrowOnError( + api, api.KernelInfoGetAttribute_float(info, "att_float", &value_float)); att_float = value_float; // An integer attribute. int64_t value_int64; - ThrowOnError(api, api.KernelInfoGetAttribute_int64(info, "att_int64", &value_int64)); + ThrowOnError( + api, api.KernelInfoGetAttribute_int64(info, "att_int64", &value_int64)); att_int64 = value_int64; // A string attribute. char value_string[1000]; size_t size = 1000; - ThrowOnError(api, api.KernelInfoGetAttribute_string(info, "att_string", value_string, &size)); + ThrowOnError(api, api.KernelInfoGetAttribute_string(info, "att_string", + value_string, &size)); att_string = value_string; // A tensor attribute @@ -25,22 +28,24 @@ MyCustomKernelWithAttributes::MyCustomKernelWithAttributes( OrtAllocator *cpu_allocator; ThrowOnError(api, api.GetAllocatorWithDefaultOptions(&cpu_allocator)); - OrtValue* value_tensor = nullptr; - ThrowOnError(api, api.KernelInfoGetAttribute_tensor(info, "att_tensor", cpu_allocator, &value_tensor)); + OrtValue *value_tensor = nullptr; + ThrowOnError(api, api.KernelInfoGetAttribute_tensor( + info, "att_tensor", cpu_allocator, &value_tensor)); // Retrieve the dimensions and the element type. - OrtTensorTypeAndShapeInfo* shape_info; + OrtTensorTypeAndShapeInfo *shape_info; ThrowOnError(api, api.GetTensorTypeAndShape(value_tensor, &shape_info)); // Retrieve the element type. ONNXTensorElementDataType elem_type; ThrowOnError(api, api.GetTensorElementType(shape_info, &elem_type)); - if (elem_type != ONNXTensorElementDataType::ONNX_TENSOR_ELEMENT_DATA_TYPE_DOUBLE) { + if (elem_type != + ONNXTensorElementDataType::ONNX_TENSOR_ELEMENT_DATA_TYPE_DOUBLE) { api.ReleaseTensorTypeAndShapeInfo(shape_info); api.ReleaseValue(value_tensor); throw std::runtime_error( - "Attribute 'att_tensor' of operator 'MyCustomOpWithAttributes' expects a double tensor." 
- ); + "Attribute 'att_tensor' of operator 'MyCustomOpWithAttributes' expects " + "a double tensor."); } // Retrieve the number of elements in the shape. @@ -52,7 +57,7 @@ MyCustomKernelWithAttributes::MyCustomKernelWithAttributes( size_t size_tensor; ThrowOnError(api, api.GetTensorShapeElementCount(shape_info, &size_tensor)); att_tensor_double.resize(size_tensor); - void* data; + void *data; ThrowOnError(api, api.GetTensorMutableData(value_tensor, &data)); memcpy(att_tensor_double.data(), data, size_tensor * sizeof(double)); @@ -63,9 +68,8 @@ MyCustomKernelWithAttributes::MyCustomKernelWithAttributes( // Verifications. if (att_tensor_double.empty()) { - throw std::runtime_error( - "Attribute 'att_tensor' of operator 'MyCustomOpWithAttributes' cannot be empty." - ); + throw std::runtime_error("Attribute 'att_tensor' of operator " + "'MyCustomOpWithAttributes' cannot be empty."); } } @@ -77,7 +81,8 @@ void MyCustomKernelWithAttributes::Compute(OrtKernelContext *context) { const double *Y = input_Y.GetTensorData(); // Setup output, which is assumed to have the same dimensions as the inputs. 
- std::vector dimensions = input_X.GetTensorTypeAndShapeInfo().GetShape(); + std::vector dimensions = + input_X.GetTensorTypeAndShapeInfo().GetShape(); Ort::UnownedValue output = ctx.GetOutput(0, dimensions); double *out = output.GetTensorMutableData(); @@ -85,27 +90,40 @@ void MyCustomKernelWithAttributes::Compute(OrtKernelContext *context) { const size_t size = output.GetTensorTypeAndShapeInfo().GetElementCount(); // Do computation - double cst = att_tensor_double[0] + static_cast(att_float) + static_cast(att_int64) + static_cast(att_string[0]); + double cst = att_tensor_double[0] + static_cast(att_float) + + static_cast(att_int64) + + static_cast(att_string[0]); for (size_t i = 0; i < size; i++) { out[i] = X[i] + Y[i] + cst; } } -void* MyCustomOpWithAttributes::CreateKernel(const OrtApi& api, const OrtKernelInfo* info) const { +void *MyCustomOpWithAttributes::CreateKernel(const OrtApi &api, + const OrtKernelInfo *info) const { return std::make_unique(api, info).release(); }; -const char* MyCustomOpWithAttributes::GetName() const { return "MyCustomOpWithAttributes"; }; +const char *MyCustomOpWithAttributes::GetName() const { + return "MyCustomOpWithAttributes"; +}; -const char* MyCustomOpWithAttributes::GetExecutionProviderType() const { return "CPUExecutionProvider"; }; +const char *MyCustomOpWithAttributes::GetExecutionProviderType() const { + return "CPUExecutionProvider"; +}; size_t MyCustomOpWithAttributes::GetInputTypeCount() const { return 2; }; -ONNXTensorElementDataType MyCustomOpWithAttributes::GetInputType(size_t index) const { return ONNX_TENSOR_ELEMENT_DATA_TYPE_DOUBLE; }; +ONNXTensorElementDataType +MyCustomOpWithAttributes::GetInputType(size_t index) const { + return ONNX_TENSOR_ELEMENT_DATA_TYPE_DOUBLE; +}; size_t MyCustomOpWithAttributes::GetOutputTypeCount() const { return 1; }; -ONNXTensorElementDataType MyCustomOpWithAttributes::GetOutputType(size_t index) const { return ONNX_TENSOR_ELEMENT_DATA_TYPE_DOUBLE; }; +ONNXTensorElementDataType 
+MyCustomOpWithAttributes::GetOutputType(size_t index) const { + return ONNX_TENSOR_ELEMENT_DATA_TYPE_DOUBLE; +}; } // namespace ortops diff --git a/onnx_extended/ortops/tutorial/cpu/my_kernel_attr.h b/onnx_extended/ortops/tutorial/cpu/my_kernel_attr.h index d8817f19..7d9d5082 100644 --- a/onnx_extended/ortops/tutorial/cpu/my_kernel_attr.h +++ b/onnx_extended/ortops/tutorial/cpu/my_kernel_attr.h @@ -5,20 +5,22 @@ namespace ortops { struct MyCustomKernelWithAttributes { - MyCustomKernelWithAttributes(const OrtApi& api, const OrtKernelInfo* info); - void Compute(OrtKernelContext* context); + MyCustomKernelWithAttributes(const OrtApi &api, const OrtKernelInfo *info); + void Compute(OrtKernelContext *context); - private: - std::string att_string; - float att_float; - int64_t att_int64; - std::vector att_tensor_double; +private: + std::string att_string; + float att_float; + int64_t att_int64; + std::vector att_tensor_double; }; -struct MyCustomOpWithAttributes : Ort::CustomOpBase { - void* CreateKernel(const OrtApi& api, const OrtKernelInfo* info) const ; - const char* GetName() const; - const char* GetExecutionProviderType() const; +struct MyCustomOpWithAttributes + : Ort::CustomOpBase { + void *CreateKernel(const OrtApi &api, const OrtKernelInfo *info) const; + const char *GetName() const; + const char *GetExecutionProviderType() const; size_t GetInputTypeCount() const; ONNXTensorElementDataType GetInputType(size_t index) const; size_t GetOutputTypeCount() const; diff --git a/onnx_extended/ortops/tutorial/cpu/ort_tutorial_cpu_lib.cc b/onnx_extended/ortops/tutorial/cpu/ort_tutorial_cpu_lib.cc index 3609c0d1..74e2bcce 100644 --- a/onnx_extended/ortops/tutorial/cpu/ort_tutorial_cpu_lib.cc +++ b/onnx_extended/ortops/tutorial/cpu/ort_tutorial_cpu_lib.cc @@ -4,20 +4,21 @@ #include #include -#include "ort_tutorial_cpu_lib.h" #include "my_kernel.h" #include "my_kernel_attr.h" +#include "ort_tutorial_cpu_lib.h" -static const char* c_OpDomain = 
"onnx_extented.ortops.tutorial.cpu"; +static const char *c_OpDomain = "onnx_extented.ortops.tutorial.cpu"; -static void AddOrtCustomOpDomainToContainer(Ort::CustomOpDomain&& domain) { +static void AddOrtCustomOpDomainToContainer(Ort::CustomOpDomain &&domain) { static std::vector ort_custom_op_domain_container; static std::mutex ort_custom_op_domain_mutex; std::lock_guard lock(ort_custom_op_domain_mutex); ort_custom_op_domain_container.push_back(std::move(domain)); } -OrtStatus* ORT_API_CALL RegisterCustomOps(OrtSessionOptions* options, const OrtApiBase* api_base) { +OrtStatus *ORT_API_CALL RegisterCustomOps(OrtSessionOptions *options, + const OrtApiBase *api_base) { Ort::InitApi(api_base->GetApi(ORT_API_VERSION)); Ort::UnownedSessionOptions session_options(options); @@ -25,7 +26,7 @@ OrtStatus* ORT_API_CALL RegisterCustomOps(OrtSessionOptions* options, const OrtA static ortops::MyCustomOp c_CustomOp; static ortops::MyCustomOpWithAttributes c_CustomOpAttr; - OrtStatus* result = nullptr; + OrtStatus *result = nullptr; try { Ort::CustomOpDomain domain{c_OpDomain}; @@ -35,8 +36,7 @@ OrtStatus* ORT_API_CALL RegisterCustomOps(OrtSessionOptions* options, const OrtA session_options.Add(domain); AddOrtCustomOpDomainToContainer(std::move(domain)); - } - catch (const std::exception& e) { + } catch (const std::exception &e) { Ort::Status status{e}; result = status.release(); } diff --git a/onnx_extended/ortops/tutorial/cpu/ort_tutorial_cpu_lib.h b/onnx_extended/ortops/tutorial/cpu/ort_tutorial_cpu_lib.h index a69d2d7d..0e7867fa 100644 --- a/onnx_extended/ortops/tutorial/cpu/ort_tutorial_cpu_lib.h +++ b/onnx_extended/ortops/tutorial/cpu/ort_tutorial_cpu_lib.h @@ -8,7 +8,8 @@ extern "C" { #endif -ORT_EXPORT OrtStatus* ORT_API_CALL RegisterCustomOps(OrtSessionOptions* options, const OrtApiBase* api_base); +ORT_EXPORT OrtStatus *ORT_API_CALL +RegisterCustomOps(OrtSessionOptions *options, const OrtApiBase *api_base); #ifdef __cplusplus } diff --git 
a/onnx_extended/ortops/tutorial/cuda/custom_gemm.cu b/onnx_extended/ortops/tutorial/cuda/custom_gemm.cu index a791dae4..5f285204 100644 --- a/onnx_extended/ortops/tutorial/cuda/custom_gemm.cu +++ b/onnx_extended/ortops/tutorial/cuda/custom_gemm.cu @@ -215,8 +215,8 @@ CustomGemmKernel::CustomGemmKernel(const OrtApi &api, EXT_THROW("Unexpected value for compute_type '", compute_type, "'."); } - std::string activation = KernelInfoGetOptionalAttributeString( - api, info, "activation", "DEFUALT"); + std::string activation = + KernelInfoGetOptionalAttributeString(api, info, "activation", "DEFUALT"); if (activation == "DEFUALT") { epilogue_ = EpiloqueGemmKernel::Default; } else if (activation == "RELU") { @@ -319,7 +319,8 @@ void CustomGemmKernel::Compute(OrtKernelContext *context) { bool has_scales = n_inputs > 3; if (has_scales) { - EXT_ENFORCE(n_inputs == 6, "Number of inputs must be 6 but is ", n_inputs, "."); + EXT_ENFORCE(n_inputs == 6, "Number of inputs must be 6 but is ", n_inputs, + "."); scale_A = ctx.GetInput(3); scale_B = ctx.GetInput(4); scale_Y = ctx.GetInput(5); @@ -516,17 +517,17 @@ void CustomGemmKernel::ComputeGemm( } cublasLtEpilogue_t epilogue; - switch(epilogue_) { - case EpiloqueGemmKernel::Default: + switch (epilogue_) { + case EpiloqueGemmKernel::Default: epilogue = CUBLASLT_EPILOGUE_DEFAULT; break; - case EpiloqueGemmKernel::Relu: + case EpiloqueGemmKernel::Relu: epilogue = CUBLASLT_EPILOGUE_RELU; break; - case EpiloqueGemmKernel::Gelu: + case EpiloqueGemmKernel::Gelu: epilogue = CUBLASLT_EPILOGUE_GELU; break; - } + } cublasLtMatmulDescSetAttribute(operationDesc, CUBLASLT_MATMUL_DESC_EPILOGUE, &epilogue, sizeof(epilogue)); diff --git a/onnx_extended/ortops/tutorial/cuda/ort_tutorial_cuda_lib.cc b/onnx_extended/ortops/tutorial/cuda/ort_tutorial_cuda_lib.cc index 5cad8ae6..28afd8db 100644 --- a/onnx_extended/ortops/tutorial/cuda/ort_tutorial_cuda_lib.cc +++ b/onnx_extended/ortops/tutorial/cuda/ort_tutorial_cuda_lib.cc @@ -4,44 +4,44 @@ #include 
#include -#include "ort_tutorial_cuda_lib.h" #include "custom_gemm.h" +#include "ort_tutorial_cuda_lib.h" -static const char* c_OpDomain = "onnx_extented.ortops.tutorial.cuda"; +static const char *c_OpDomain = "onnx_extented.ortops.tutorial.cuda"; -static void AddOrtCustomOpDomainToContainer(Ort::CustomOpDomain&& domain) { +static void AddOrtCustomOpDomainToContainer(Ort::CustomOpDomain &&domain) { static std::vector ort_custom_op_domain_container; static std::mutex ort_custom_op_domain_mutex; std::lock_guard lock(ort_custom_op_domain_mutex); ort_custom_op_domain_container.push_back(std::move(domain)); } -OrtStatus* ORT_API_CALL RegisterCustomOps(OrtSessionOptions* options, const OrtApiBase* api_base) { +OrtStatus *ORT_API_CALL RegisterCustomOps(OrtSessionOptions *options, + const OrtApiBase *api_base) { Ort::InitApi(api_base->GetApi(ORT_API_VERSION)); Ort::UnownedSessionOptions session_options(options); // An instance remaining available until onnxruntime unload the library. static ortops::CustomGemmOpFloat c_CustomGemmFloat; static ortops::CustomGemmOpFloat16 c_CustomGemmFloat16; - #if ORT_VERSION >= 1160 && CUDA_VERSION >= 11080 +#if ORT_VERSION >= 1160 && CUDA_VERSION >= 11080 static ortops::CustomGemmOpFloat8E4M3FN c_CustomGemmFloat8E4M3FN; - #endif +#endif - OrtStatus* result = nullptr; + OrtStatus *result = nullptr; try { Ort::CustomOpDomain domain{c_OpDomain}; domain.Add(&c_CustomGemmFloat); domain.Add(&c_CustomGemmFloat16); - #if ORT_VERSION >= 1160 && CUDA_VERSION >= 11080 +#if ORT_VERSION >= 1160 && CUDA_VERSION >= 11080 domain.Add(&c_CustomGemmFloat8E4M3FN); - #endif +#endif session_options.Add(domain); AddOrtCustomOpDomainToContainer(std::move(domain)); - } - catch (const std::exception& e) { + } catch (const std::exception &e) { Ort::Status status{e}; result = status.release(); } diff --git a/onnx_extended/ortops/tutorial/cuda/ort_tutorial_cuda_lib.h b/onnx_extended/ortops/tutorial/cuda/ort_tutorial_cuda_lib.h index a69d2d7d..0e7867fa 100644 --- 
a/onnx_extended/ortops/tutorial/cuda/ort_tutorial_cuda_lib.h +++ b/onnx_extended/ortops/tutorial/cuda/ort_tutorial_cuda_lib.h @@ -8,7 +8,8 @@ extern "C" { #endif -ORT_EXPORT OrtStatus* ORT_API_CALL RegisterCustomOps(OrtSessionOptions* options, const OrtApiBase* api_base); +ORT_EXPORT OrtStatus *ORT_API_CALL +RegisterCustomOps(OrtSessionOptions *options, const OrtApiBase *api_base); #ifdef __cplusplus } diff --git a/onnx_extended/reference/c_custom_ops/__init__.py b/onnx_extended/reference/c_custom_ops/__init__.py new file mode 100644 index 00000000..8b137891 --- /dev/null +++ b/onnx_extended/reference/c_custom_ops/__init__.py @@ -0,0 +1 @@ + diff --git a/onnx_extended/reference/c_custom_ops/custom_op_tree_ensemble_regressor.py b/onnx_extended/reference/c_custom_ops/custom_op_tree_ensemble_regressor.py new file mode 100644 index 00000000..45cf49b6 --- /dev/null +++ b/onnx_extended/reference/c_custom_ops/custom_op_tree_ensemble_regressor.py @@ -0,0 +1,190 @@ +from typing import Any, Dict +import numpy +from onnx import NodeProto +from onnx.defs import OpSchema, get_schema +from onnx.reference.op_run import OpRun +from ..c_ops.cpu.c_op_tree_ensemble_py_ import ( + RuntimeTreeEnsembleRegressorFloat, + RuntimeTreeEnsembleRegressorDouble, +) + + +class TreeEnsembleRegressorCommon(OpRun): + op_domain = "onnx_extented.ortops.optim.cpu" + + def __init__( + self, onnx_node: NodeProto, run_params: Dict[str, Any], schema: Any = None + ): + OpRun.__init__(self, onnx_node, run_params, schema=schema) + self.parallel = None + self.rt_ = None + # default is no parallelization + self.set_parallel(int(100e6), int(100e6), int(100e6), 1, 1, 0) + + def set_parallel( + self, + parallel_tree: int = -1, + parallel_tree_N: int = -1, + parallel_N: int = -1, + batch_size_tree: int = -1, + batch_size_rows: int = -1, + node3: int = -1, + ): + """ + Sets the parameter for parallelization. + If a parameter is set to -1, its value does not change. 
+ + :param parallel_tree: parallization by trees if the number of trees is higher + :param parallel_tree_N: batch size (rows) if parallization by trees + :param parallel_N: parallization by rows if the number of rows is higher + :param batch_size_tree: number of trees to compute at the same time + :param batch_size_rows: number of rows to compute at the same time + :param node3: use bigger nodes + """ + self.parallel = ( + parallel_tree, + parallel_tree_N, + parallel_N, + batch_size_tree, + batch_size_rows, + node3, + ) + if self.rt_ is not None: + self.rt_.set(*self.parallel) + + def _init(self, dtype, **kwargs): + if dtype == numpy.float32: + cls = RuntimeTreeEnsembleRegressorFloat + else: + cls = RuntimeTreeEnsembleRegressorDouble + + self.rt_ = cls() + + empty_f = numpy.array([], dtype=dtype) + base_values = numpy.array( + kwargs.get("base_values", None) + or kwargs.get("base_values_as_tensor", None) + or empty_f + ) + nodes_values = numpy.array( + kwargs.get("nodes_values", None) + or kwargs.get("nodes_values_as_tensor", None) + or empty_f + ) + nodes_hitrates = numpy.array( + kwargs.get("nodes_hitrates", None) + or kwargs.get("nodes_hitrates_as_tensor", None) + or empty_f + ) + tw = numpy.array( + kwargs.get("target_weights", None) + or kwargs.get("target_weights", None) + or empty_f + ) + + self.rt_.init( + kwargs.get("aggregate_function", "SUM"), # 3 + base_values, # 4 + kwargs["n_targets"], # 5 + kwargs["nodes_falsenodeids"], # 6 + kwargs["nodes_featureids"], # 7 + nodes_hitrates, # 8 + kwargs.get("nodes_missing_value_tracks_true", []), # 9 + kwargs["nodes_modes"].split(","), # 10 + kwargs["nodes_nodeids"], # 11 + kwargs["nodes_treeids"], # 12 + kwargs["nodes_truenodeids"], # 13 + nodes_values, # 14 + kwargs["post_transform"], # 15 + kwargs["target_ids"], # 16 + kwargs["target_nodeids"], # 17 + kwargs["target_treeids"], # 18 + tw, # 19 + ) + if self.parallel is not None: + self.rt_.set(*self.parallel) + + def _run(self, x, **kwargs): + if hasattr(x, 
"todense"): + x = x.todense() + if self.rt_ is None: + self._init(x.dtype, **kwargs) + pred = self.rt_.compute(x) + if pred.shape[0] != x.shape[0]: + pred = pred.reshape((x.shape[0], -1)) + return (pred,) + + +def _make_schema(): + attributes = [] + sch = get_schema("TreeEnsembleRegressor", 1, "ai.onnx.ml") + for att in sch.attributes.values(): + if att.name == "nodes_modes": + attributes.append( + OpSchema.Attribute( + "nodes_modes", + OpSchema.AttrType.STRING, + "comma separated value nodes_modes", + ) + ) + else: + attributes.append(att) + return OpSchema( + "TreeEnsembleRegressor", + TreeEnsembleRegressorCommon.op_domain, + 1, + inputs=[ + OpSchema.FormalParameter("X", "T"), + ], + outputs=[ + OpSchema.FormalParameter("Y", "T"), + ], + type_constraints=[("T", ["tensor(float)"], "")], + attributes=attributes, + ) + + +class TreeEnsembleRegressor_1(TreeEnsembleRegressorCommon): + op_schema = _make_schema() + + def _run( + self, + x, + aggregate_function=None, + base_values=None, + n_targets=None, + nodes_falsenodeids=None, + nodes_featureids=None, + nodes_hitrates=None, + nodes_missing_value_tracks_true=None, + nodes_modes=None, + nodes_nodeids=None, + nodes_treeids=None, + nodes_truenodeids=None, + nodes_values=None, + post_transform=None, + target_ids=None, + target_nodeids=None, + target_treeids=None, + target_weights=None, + ): + return TreeEnsembleRegressorCommon._run( + self, + x, + base_values=base_values, + n_targets=n_targets, + nodes_falsenodeids=nodes_falsenodeids, + nodes_featureids=nodes_featureids, + nodes_hitrates=nodes_hitrates, + nodes_missing_value_tracks_true=nodes_missing_value_tracks_true, + nodes_modes=nodes_modes, + nodes_nodeids=nodes_nodeids, + nodes_treeids=nodes_treeids, + nodes_truenodeids=nodes_truenodeids, + nodes_values=nodes_values, + post_transform=post_transform, + target_ids=target_ids, + target_nodeids=target_nodeids, + target_treeids=target_treeids, + target_weights=target_weights, + ) diff --git 
a/onnx_extended/reference/c_ops/c_op_common.cpp b/onnx_extended/reference/c_ops/c_op_common.cpp new file mode 100755 index 00000000..19ade3aa Binary files /dev/null and b/onnx_extended/reference/c_ops/c_op_common.cpp differ diff --git a/onnx_extended/reference/c_ops/cpu/c_op_common.h b/onnx_extended/reference/c_ops/cpu/c_op_common.h index a3e6a190..1097cd84 100644 --- a/onnx_extended/reference/c_ops/cpu/c_op_common.h +++ b/onnx_extended/reference/c_ops/cpu/c_op_common.h @@ -13,6 +13,8 @@ #include #endif +#include "onnx_extended_helpers.h" + namespace onnx_c_ops { void *AllocatorDefaultAlloc(size_t size); @@ -440,79 +442,4 @@ void debug_print(const std::string &msg, size_t i, size_t j, size_t k, float pa, void debug_print(const std::string &msg, size_t i, size_t j, size_t k, double pa, double pb, double val); -inline void MakeStringInternal(std::ostringstream &ss) noexcept {} - -template -inline void MakeStringInternal(std::ostringstream &ss, const T &t) noexcept { - ss << t; -} - -template <> -inline void MakeStringInternal(std::ostringstream &ss, - const std::vector &t) noexcept { - for (auto it : t) - ss << "x" << it; -} - -template <> -inline void MakeStringInternal(std::ostringstream &ss, - const std::vector &t) noexcept { - for (auto it : t) - ss << "x" << it; -} - -template <> -inline void MakeStringInternal(std::ostringstream &ss, - const std::vector &t) noexcept { - for (auto it : t) - ss << "x" << it; -} - -template <> -inline void MakeStringInternal(std::ostringstream &ss, - const std::vector &t) noexcept { - for (auto it : t) - ss << "x" << it; -} - -template <> -inline void MakeStringInternal(std::ostringstream &ss, - const std::vector &t) noexcept { - for (auto it : t) - ss << "x" << it; -} - -template <> -inline void MakeStringInternal(std::ostringstream &ss, - const std::vector &t) noexcept { - for (auto it : t) - ss << "x" << it; -} - -template -inline void MakeStringInternal(std::ostringstream &ss, const T &t, - const Args &...args) noexcept { - 
MakeStringInternal(ss, t); - MakeStringInternal(ss, args...); -} - -template inline std::string MakeString(const Args &...args) { - std::ostringstream ss; - MakeStringInternal(ss, args...); - return std::string(ss.str()); -} - -#if !defined(_THROW_DEFINED) -#define EXT_THROW(...) throw std::runtime_error(MakeString(__VA_ARGS__)); -#define _THROW_DEFINED -#endif - -#if !defined(_ENFORCE_DEFINED) -#define EXT_ENFORCE(cond, ...) \ - if (!(cond)) \ - throw std::runtime_error( \ - MakeString("`", #cond, "` failed. ", MakeString(__VA_ARGS__))); -#define _ENFORCE_DEFINED -#endif - } // namespace onnx_c_ops diff --git a/onnx_extended/reference/c_ops/cpu/c_op_common_parallel.hpp b/onnx_extended/reference/c_ops/cpu/c_op_common_parallel.hpp index 21461e91..71dc927b 100644 --- a/onnx_extended/reference/c_ops/cpu/c_op_common_parallel.hpp +++ b/onnx_extended/reference/c_ops/cpu/c_op_common_parallel.hpp @@ -58,7 +58,7 @@ inline void TrySimpleParallelFor(int64_t n_threads, int64_t batch_size, template inline void TryBatchParallelFor(int64_t n_threads, int64_t batch_size, - int64_t total, F &&fn) { + int64_t total, F &&fn) { if (n_threads != omp_get_max_threads()) { throw std::runtime_error("TryBatchParallelFor not implemented when " "n_threads != omp_get_max_threads()."); diff --git a/onnx_extended/reference/c_ops/cpu/c_op_conv.h b/onnx_extended/reference/c_ops/cpu/c_op_conv.h index 0a355458..193e18d7 100644 --- a/onnx_extended/reference/c_ops/cpu/c_op_conv.h +++ b/onnx_extended/reference/c_ops/cpu/c_op_conv.h @@ -1,5 +1,7 @@ #include "c_op_conv_common.h" +using namespace onnx_extended_helpers; + namespace onnx_c_ops { template diff --git a/onnx_extended/reference/c_ops/cpu/c_op_conv_common.h b/onnx_extended/reference/c_ops/cpu/c_op_conv_common.h index ddb5c3b2..09d1d098 100644 --- a/onnx_extended/reference/c_ops/cpu/c_op_conv_common.h +++ b/onnx_extended/reference/c_ops/cpu/c_op_conv_common.h @@ -11,110 +11,104 @@ namespace onnx_c_ops { // was initialized. 
template void gemm(bool transA, bool transB, size_t M, size_t N, size_t K, NTYPE alpha, - const NTYPE* A, const NTYPE* B, NTYPE beta, NTYPE* C) { + const NTYPE *A, const NTYPE *B, NTYPE beta, NTYPE *C) { #if defined(__APPLE__) - // link issues on apple, "___kmpc_fork_call", referenced from: - if (transA) { - if (transB) { - } - else { - // a A B + b C, dimension = M * N - NTYPE* begin; - NTYPE val; - NTYPE val0; - size_t i, j, k, maxc = 0; - const NTYPE* pA, * pB; - for (i = 0, begin = C; i < M; ++i) { - for (j = 0; j < N; ++j, ++begin) { - val0 = *begin * beta; - val = 0; - pA = A + i; - pB = B + j; - for (k = K; k > 0; --k, pA += K, pB += N) - val += *pA * *pB; - *begin = val0 + val * alpha; - maxc = maxc > (size_t)(begin - C) ? maxc : (size_t)(begin - C); - if (maxc > M * N) - throw std::invalid_argument("gemm10: maxc > M * N"); - } - } - return; + // link issues on apple, "___kmpc_fork_call", referenced from: + if (transA) { + if (transB) { + } else { + // a A B + b C, dimension = M * N + NTYPE *begin; + NTYPE val; + NTYPE val0; + size_t i, j, k, maxc = 0; + const NTYPE *pA, *pB; + for (i = 0, begin = C; i < M; ++i) { + for (j = 0; j < N; ++j, ++begin) { + val0 = *begin * beta; + val = 0; + pA = A + i; + pB = B + j; + for (k = K; k > 0; --k, pA += K, pB += N) + val += *pA * *pB; + *begin = val0 + val * alpha; + maxc = maxc > (size_t)(begin - C) ? maxc : (size_t)(begin - C); + if (maxc > M * N) + throw std::invalid_argument("gemm10: maxc > M * N"); } + } + return; } - else { - if (transB) { - } - else { - // a A B + b C, dimension = M * N - NTYPE* begin; - NTYPE val; - NTYPE val0; - size_t i, j, k, maxc = 0; - const NTYPE* pA, * pB; - for (i = 0, begin = C; i < M; ++i) { - for (j = 0; j < N; ++j, ++begin) { - val0 = *begin * beta; - val = 0; - pA = A + i * K; - pB = B + j; - for (k = K; k > 0; --k, ++pA, pB += N) - val += *pA * *pB; - *begin = val0 + val * alpha; - maxc = maxc > (size_t)(begin - C) ? 
maxc : (size_t)(begin - C); - if (maxc > M * N) - throw std::invalid_argument("gemm00: maxc > M * N"); - } - } - return; + } else { + if (transB) { + } else { + // a A B + b C, dimension = M * N + NTYPE *begin; + NTYPE val; + NTYPE val0; + size_t i, j, k, maxc = 0; + const NTYPE *pA, *pB; + for (i = 0, begin = C; i < M; ++i) { + for (j = 0; j < N; ++j, ++begin) { + val0 = *begin * beta; + val = 0; + pA = A + i * K; + pB = B + j; + for (k = K; k > 0; --k, ++pA, pB += N) + val += *pA * *pB; + *begin = val0 + val * alpha; + maxc = maxc > (size_t)(begin - C) ? maxc : (size_t)(begin - C); + if (maxc > M * N) + throw std::invalid_argument("gemm00: maxc > M * N"); } + } + return; } + } #else - typedef Map> matrixdd_row; - typedef Map> matrixdd_col; - matrixdd_row mc(C, M, N); - if (beta != 1) - mc *= beta; - if (transA) { - matrixdd_col ma((NTYPE*)A, M, K); - if (transB) { - matrixdd_col mb((NTYPE*)B, K, N); - if (alpha != 1) - mc.noalias() += alpha * ma * mb; - else - mc.noalias() += ma * mb; - return; - } - else { - matrixdd_row mb((NTYPE*)B, K, N); - if (alpha != 1) - mc.noalias() += alpha * ma * mb; - else - mc.noalias() += ma * mb; - return; - } + typedef Map> matrixdd_row; + typedef Map> matrixdd_col; + matrixdd_row mc(C, M, N); + if (beta != 1) + mc *= beta; + if (transA) { + matrixdd_col ma((NTYPE *)A, M, K); + if (transB) { + matrixdd_col mb((NTYPE *)B, K, N); + if (alpha != 1) + mc.noalias() += alpha * ma * mb; + else + mc.noalias() += ma * mb; + return; + } else { + matrixdd_row mb((NTYPE *)B, K, N); + if (alpha != 1) + mc.noalias() += alpha * ma * mb; + else + mc.noalias() += ma * mb; + return; } - else { - matrixdd_row ma((NTYPE*)A, M, K); - if (transB) { - matrixdd_col mb((NTYPE*)B, K, N); - if (alpha != 1) - mc.noalias() += alpha * ma * mb; - else - mc.noalias() += ma * mb; - return; - } - else { - matrixdd_row mb((NTYPE*)B, K, N); - if (alpha != 1) - mc.noalias() += alpha * ma * mb; - else - mc.noalias() += ma * mb; - return; - } + } else { + matrixdd_row 
ma((NTYPE *)A, M, K); + if (transB) { + matrixdd_col mb((NTYPE *)B, K, N); + if (alpha != 1) + mc.noalias() += alpha * ma * mb; + else + mc.noalias() += ma * mb; + return; + } else { + matrixdd_row mb((NTYPE *)B, K, N); + if (alpha != 1) + mc.noalias() += alpha * ma * mb; + else + mc.noalias() += ma * mb; + return; } + } #endif - throw std::invalid_argument( - "Not implemented for adjointd matrices (Gemm)."); + throw std::invalid_argument( + "Not implemented for adjointd matrices (Gemm)."); } }; // namespace onnx_c_ops diff --git a/onnx_extended/reference/c_ops/cpu/c_op_tree_ensemble_common_.hpp b/onnx_extended/reference/c_ops/cpu/c_op_tree_ensemble_common_.hpp index 218bb57c..7a29cbe6 100644 --- a/onnx_extended/reference/c_ops/cpu/c_op_tree_ensemble_common_.hpp +++ b/onnx_extended/reference/c_ops/cpu/c_op_tree_ensemble_common_.hpp @@ -6,13 +6,18 @@ #include "c_op_common_parallel.hpp" #include "c_op_tree_ensemble_common_agg_.hpp" +#include +#include -// #define DEBUG_PRINT(...) printf("%s", MakeString("*", __FILE__, ":", __LINE__, ":", MakeString(__VA_ARGS__), "\n").c_str()); +// #define DEBUG_PRINT(...) printf("%s", MakeString("*", __FILE__, ":", +// __LINE__, ":", MakeString(__VA_ARGS__), "\n").c_str()); #define DEBUG_PRINT(...) 
// https://cims.nyu.edu/~stadler/hpc17/material/ompLec.pdf // http://amestoy.perso.enseeiht.fr/COURS/CoursMulticoreProgrammingButtari.pdf +using namespace onnx_extended_helpers; + namespace onnx_c_ops { template struct TreeAlloc { @@ -185,7 +190,9 @@ Status TreeEnsembleCommon::Init( DEBUG_PRINT("Init") EXT_ENFORCE(n_targets_or_classes > 0); EXT_ENFORCE(nodes_falsenodeids.size() == nodes_featureids.size()); - EXT_ENFORCE(nodes_falsenodeids.size() == nodes_modes.size()); + EXT_ENFORCE(nodes_falsenodeids.size() == nodes_modes.size(), + "nodes_falsenodeids.size()=", nodes_falsenodeids.size(), + " nodes_modes.size()=", nodes_modes.size()); EXT_ENFORCE(nodes_falsenodeids.size() == nodes_nodeids.size()); EXT_ENFORCE(nodes_falsenodeids.size() == nodes_treeids.size()); EXT_ENFORCE(nodes_falsenodeids.size() == nodes_truenodeids.size()); @@ -262,8 +269,8 @@ Status TreeEnsembleCommon::Init( auto p = idi.insert(std::pair(node_tree_id, i)); if (!p.second) { - EXT_THROW("Node ", node_tree_id.node_id, " in tree ", node_tree_id.tree_id, - " is already there."); + EXT_THROW("Node ", node_tree_id.node_id, " in tree ", + node_tree_id.tree_id, " is already there."); } nodes_.emplace_back(node); node_tree_ids.emplace_back(node_tree_id); @@ -289,11 +296,11 @@ Status TreeEnsembleCommon::Init( auto found = idi.find(coor); if (found == idi.end()) { EXT_THROW("Unable to find node ", coor.tree_id, "-", coor.node_id, - " (truenode)."); + " (truenode)."); } if (found->second == truenode_ids.size()) { EXT_THROW("A node cannot point to itself: ", coor.tree_id, "-", - node_tree_id.node_id, " (truenode)."); + node_tree_id.node_id, " (truenode)."); } truenode_ids.emplace_back(found->second); @@ -302,11 +309,11 @@ Status TreeEnsembleCommon::Init( found = idi.find(coor); if (found == idi.end()) { EXT_THROW("Unable to find node ", coor.tree_id, "-", coor.node_id, - " (falsenode)."); + " (falsenode)."); } if (found->second == falsenode_ids.size()) { EXT_THROW("A node cannot point to itself: ", 
coor.tree_id, "-", - node_tree_id.node_id, " (falsenode)."); + node_tree_id.node_id, " (falsenode)."); } falsenode_ids.emplace_back(found->second); // We could also check that truenode_ids[truenode_ids.size() - 1] != @@ -334,7 +341,7 @@ Status TreeEnsembleCommon::Init( auto found = idi.find(ind); if (found == idi.end()) { EXT_THROW("Unable to find node ", ind.tree_id, "-", ind.node_id, - " (weights)."); + " (weights)."); } TreeNodeElement &leaf = nodes_[found->second]; @@ -365,16 +372,16 @@ Status TreeEnsembleCommon::Init( if (!nodes_[i].is_not_leaf()) { if (nodes_[i].falsenode_inc_or_n_weights == 0) { EXT_THROW("Target is missing for leaf ", ind.tree_id, "-", ind.node_id, - "."); + "."); } continue; } EXT_ENFORCE(truenode_ids[i] != i); // That would mean the left node is - // itself, leading to an infinite loop. + // itself, leading to an infinite loop. nodes_[i].truenode_inc_or_first_weight = static_cast(truenode_ids[i] - i); EXT_ENFORCE(falsenode_ids[i] != i); // That would mean the right node is - // itself, leading to an infinite loop. + // itself, leading to an infinite loop. 
nodes_[i].falsenode_inc_or_n_weights = static_cast(falsenode_ids[i] - i); } @@ -445,7 +452,8 @@ int TreeEnsembleCommon:: while (!stack.empty()) { pair = stack.front(); stack.pop_front(); - // EXT_ENFORCE(map_node_to_node3.find(pair.first) == map_node_to_node3.end(), + // EXT_ENFORCE(map_node_to_node3.find(pair.first) == + // map_node_to_node3.end(), // "This node index ", pair.first, // " was already added as a TreeNodeElement3."); node = pair.second; @@ -603,7 +611,8 @@ void TreeEnsembleCommon::ComputeAgg( DEBUG_PRINT("max_num_threads=", max_num_threads) DEBUG_PRINT("parallel_tree_N_=", parallel_tree_N_) DEBUG_PRINT("parallel_tree_n=", parallel_tree_n) - DEBUG_PRINT("n_targets_or_classes_=", n_targets_or_classes_, " N=", N, " agg.kind()=", agg.kind()) + DEBUG_PRINT("n_targets_or_classes_=", n_targets_or_classes_, " N=", N, + " agg.kind()=", agg.kind()) if (n_targets_or_classes_ == 1) { DEBUG_PRINT() @@ -654,7 +663,8 @@ void TreeEnsembleCommon::ComputeAgg( // That's why the first loop split into batch so that every batch holds // on caches, then loop on trees and finally loop on the batch rows. DEBUG_PRINT() - std::vector> scores(std::min(parallel_tree_n, N)); + std::vector> scores( + std::min(parallel_tree_n, N)); size_t j; int64_t i, batch, batch_end; @@ -806,10 +816,9 @@ void TreeEnsembleCommon::ComputeAgg( } } for (i = batch; i < batch_end; ++i) { - agg.FinalizeScores(scores[i - batch], - z_data + i * n_targets_or_classes_, -1, - label_data == nullptr ? nullptr - : (label_data + i)); + agg.FinalizeScores( + scores[i - batch], z_data + i * n_targets_or_classes_, -1, + label_data == nullptr ? 
nullptr : (label_data + i)); } } DEBUG_PRINT() @@ -992,7 +1001,7 @@ const TreeNodeElement * TreeEnsembleCommon::ProcessTreeNodeLeave3( size_t root_id, const InputType *x_data) const { EXT_ENFORCE(same_mode_, "This optimization is only available when all node " - "follow the same mode."); + "follow the same mode."); const TreeNodeElement3 *root3 = roots3_[root_id]; const TreeNodeElement *root; EXT_ENFORCE(root3 != nullptr, "No optimization for tree ", root_id, "."); @@ -1018,7 +1027,8 @@ TreeEnsembleCommon::ProcessTreeNodeLeave3( } break; default: - EXT_THROW("TreeNodeElement3 not yet implement with mode ", root3->mode(), "."); + EXT_THROW("TreeNodeElement3 not yet implement with mode ", root3->mode(), + "."); } } @@ -1047,7 +1057,8 @@ TreeEnsembleCommon::ProcessTreeNodeLeave( DEBUG_PRINT("LEQ+") while (root->is_not_leaf()) { val = x_data[root->feature_id]; - DEBUG_PRINT("val=", val, " root->value_or_unique_weight=", root->value_or_unique_weight) + DEBUG_PRINT("val=", val, " root->value_or_unique_weight=", + root->value_or_unique_weight) root += val <= root->value_or_unique_weight ? 
root->truenode_inc_or_first_weight : root->falsenode_inc_or_n_weights; diff --git a/onnx_extended/reference/c_ops/cpu/c_op_tree_ensemble_common_agg_.hpp b/onnx_extended/reference/c_ops/cpu/c_op_tree_ensemble_common_agg_.hpp index b59ec7ca..fc82d694 100644 --- a/onnx_extended/reference/c_ops/cpu/c_op_tree_ensemble_common_agg_.hpp +++ b/onnx_extended/reference/c_ops/cpu/c_op_tree_ensemble_common_agg_.hpp @@ -9,12 +9,6 @@ #include #include -#include -#include -#include - -namespace py = pybind11; - #include "c_op_common.h" namespace onnx_c_ops { @@ -278,7 +272,7 @@ class TreeAggregator { write_scores(predictions, post_transform_, Z, add_second_class); } - const char* kind() const { return "NONE"; } + const char *kind() const { return "NONE"; } }; ///////////// @@ -354,7 +348,7 @@ class TreeAggregatorSum write_scores(predictions, this->post_transform_, Z, add_second_class); } - const char* kind() const { return "SUM"; } + const char *kind() const { return "SUM"; } }; template @@ -393,7 +387,7 @@ class TreeAggregatorAverage write_scores(predictions, this->post_transform_, Z, add_second_class); } - const char* kind() const { return "AVERAGE"; } + const char *kind() const { return "AVERAGE"; } }; template @@ -462,7 +456,7 @@ class TreeAggregatorMin } } - const char* kind() const { return "MIN"; } + const char *kind() const { return "MIN"; } }; template @@ -532,7 +526,7 @@ class TreeAggregatorMax } } - const char* kind() const { return "MAX"; } + const char *kind() const { return "MAX"; } }; ///////////////// @@ -710,7 +704,7 @@ class TreeAggregatorClassifier predictions.resize(2); } - const char* kind() const { return "CLASSIFICATION"; } + const char *kind() const { return "CLASSIFICATION"; } }; } // namespace onnx_c_ops diff --git a/onnx_extended/reference/c_ops/cpu/c_op_tree_ensemble_common_classifier_.hpp b/onnx_extended/reference/c_ops/cpu/c_op_tree_ensemble_common_classifier_.hpp index e219d12f..cdb6a3ad 100644 --- 
a/onnx_extended/reference/c_ops/cpu/c_op_tree_ensemble_common_classifier_.hpp +++ b/onnx_extended/reference/c_ops/cpu/c_op_tree_ensemble_common_classifier_.hpp @@ -36,12 +36,12 @@ class TreeEnsembleCommonClassifier } } - Status Init(const std::string &aggregate_function, // 3 - const std::vector &base_values, // 4 - int64_t n_targets_or_classes, // 5 - const std::vector &nodes_falsenodeids, // 6 - const std::vector &nodes_featureids, // 7 - const std::vector &nodes_hitrates, // 8 + Status Init(const std::string &aggregate_function, // 3 + const std::vector &base_values, // 4 + int64_t n_targets_or_classes, // 5 + const std::vector &nodes_falsenodeids, // 6 + const std::vector &nodes_featureids, // 7 + const std::vector &nodes_hitrates, // 8 const std::vector &nodes_missing_value_tracks_true, // 9 const std::vector &nodes_modes, // 10 const std::vector &nodes_nodeids, // 11 @@ -55,23 +55,23 @@ class TreeEnsembleCommonClassifier const std::vector &class_weights // 19 ) { TreeEnsembleCommon::Init( - aggregate_function, // 3 - base_values, // 4 - n_targets_or_classes, // 5 - nodes_falsenodeids, // 6 - nodes_featureids, // 7 - nodes_hitrates, // 8 - nodes_missing_value_tracks_true, // 9 - nodes_modes, // 10 - nodes_nodeids, // 11 - nodes_treeids, // 12 - nodes_truenodeids, // 13 - nodes_values, // 14 - post_transform, // 15 - class_ids, // 16 - class_nodeids, // 17 - class_treeids, // 18 - class_weights // 19 + aggregate_function, // 3 + base_values, // 4 + n_targets_or_classes, // 5 + nodes_falsenodeids, // 6 + nodes_featureids, // 7 + nodes_hitrates, // 8 + nodes_missing_value_tracks_true, // 9 + nodes_modes, // 10 + nodes_nodeids, // 11 + nodes_treeids, // 12 + nodes_truenodeids, // 13 + nodes_values, // 14 + post_transform, // 15 + class_ids, // 16 + class_nodeids, // 17 + class_treeids, // 18 + class_weights // 19 ); DEBUG_PRINT("Init") diff --git a/onnx_extended/reference/c_ops/cpu/c_op_tree_ensemble_py_.cpp 
b/onnx_extended/reference/c_ops/cpu/c_op_tree_ensemble_py_.cpp index d06569b1..b094151d 100644 --- a/onnx_extended/reference/c_ops/cpu/c_op_tree_ensemble_py_.cpp +++ b/onnx_extended/reference/c_ops/cpu/c_op_tree_ensemble_py_.cpp @@ -4,6 +4,12 @@ #include "c_op_tree_ensemble_py_.hpp" #include "c_op_tree_ensemble_py_classifier_.hpp" +#include +#include +#include + +namespace py = pybind11; + ////////////////////////////////////////// // Classifier ////////////////////////////////////////// diff --git a/onnx_extended/reference/c_ops/cpu/c_op_tree_ensemble_py_.hpp b/onnx_extended/reference/c_ops/cpu/c_op_tree_ensemble_py_.hpp index 88e79610..7942f187 100644 --- a/onnx_extended/reference/c_ops/cpu/c_op_tree_ensemble_py_.hpp +++ b/onnx_extended/reference/c_ops/cpu/c_op_tree_ensemble_py_.hpp @@ -3,6 +3,12 @@ #include "c_op_tree_ensemble_common_.hpp" +#include +#include +#include + +namespace py = pybind11; + #define py_array_t_int64_t \ py::array_t #define py_array_t_ntype_t \ @@ -86,23 +92,23 @@ class RuntimeTreeEnsembleCommon array2vector(ttarget_class_treeids, target_class_treeids, int64_t); array2vector(ttarget_class_weights, target_class_weights, NTYPE); - init_c(aggregate_function, // 3 - cbasevalues, // 4 - n_targets_or_classes, // 5 - tnodes_falsenodeids, // 6 - tnodes_featureids, // 7 - tnodes_hitrates, // 8 - tmissing_tracks_true, // 9 - nodes_modes, // 10 - tnodes_nodeids, // 11 - tnodes_treeids, // 12 - tnodes_truenodeids, // 13 - tnodes_values, // 14 - post_transform, // 15 - ttarget_class_ids, // 16 - ttarget_class_nodeids, // 17 - ttarget_class_treeids, // 18 - ttarget_class_weights // 19 + init_c(aggregate_function, // 3 + cbasevalues, // 4 + n_targets_or_classes, // 5 + tnodes_falsenodeids, // 6 + tnodes_featureids, // 7 + tnodes_hitrates, // 8 + tmissing_tracks_true, // 9 + nodes_modes, // 10 + tnodes_nodeids, // 11 + tnodes_treeids, // 12 + tnodes_truenodeids, // 13 + tnodes_values, // 14 + post_transform, // 15 + ttarget_class_ids, // 16 + 
ttarget_class_nodeids, // 17 + ttarget_class_treeids, // 18 + ttarget_class_weights // 19 ); } @@ -124,23 +130,23 @@ class RuntimeTreeEnsembleCommon const std::vector &target_class_treeids, // 18 const std::vector &target_class_weights // 19 ) { - this->Init(aggregate_function, // 3 - base_values, // 4 - n_targets_or_classes, // 5 - nodes_falsenodeids, // 6 - nodes_featureids, // 7 - nodes_hitrates, // 8 - nodes_missing_value_tracks_true, // 9 - nodes_modes, // 10 - nodes_nodeids, // 11 - nodes_treeids, // 12 - nodes_truenodeids, // 13 - nodes_values, // 14 - post_transform, // 15 - target_class_ids, // 16 - target_class_nodeids, // 17 - target_class_treeids, // 18 - target_class_weights // 19 + this->Init(aggregate_function, // 3 + base_values, // 4 + n_targets_or_classes, // 5 + nodes_falsenodeids, // 6 + nodes_featureids, // 7 + nodes_hitrates, // 8 + nodes_missing_value_tracks_true, // 9 + nodes_modes, // 10 + nodes_nodeids, // 11 + nodes_treeids, // 12 + nodes_truenodeids, // 13 + nodes_values, // 14 + post_transform, // 15 + target_class_ids, // 16 + target_class_nodeids, // 17 + target_class_treeids, // 18 + target_class_weights // 19 ); } diff --git a/onnx_extended/reference/c_ops/cpu/c_op_tree_ensemble_py_classifier_.hpp b/onnx_extended/reference/c_ops/cpu/c_op_tree_ensemble_py_classifier_.hpp index ec41a068..548cac28 100644 --- a/onnx_extended/reference/c_ops/cpu/c_op_tree_ensemble_py_classifier_.hpp +++ b/onnx_extended/reference/c_ops/cpu/c_op_tree_ensemble_py_classifier_.hpp @@ -67,23 +67,23 @@ class RuntimeTreeEnsembleClassifier array2vector(ttarget_class_treeids, target_class_treeids, int64_t); array2vector(ttarget_class_weights, target_class_weights, NTYPE); - init_c(aggregate_function, // 3 - cbasevalues, // 4 - n_targets_or_classes, // 5 - tnodes_falsenodeids, // 6 - tnodes_featureids, // 7 - tnodes_hitrates, // 8 - tmissing_tracks_true, // 9 - nodes_modes, // 10 - tnodes_nodeids, // 11 - tnodes_treeids, // 12 - tnodes_truenodeids, // 13 - 
tnodes_values, // 14 - post_transform, // 15 - ttarget_class_ids, // 16 - ttarget_class_nodeids, // 17 - ttarget_class_treeids, // 18 - ttarget_class_weights // 19 + init_c(aggregate_function, // 3 + cbasevalues, // 4 + n_targets_or_classes, // 5 + tnodes_falsenodeids, // 6 + tnodes_featureids, // 7 + tnodes_hitrates, // 8 + tmissing_tracks_true, // 9 + nodes_modes, // 10 + tnodes_nodeids, // 11 + tnodes_treeids, // 12 + tnodes_truenodeids, // 13 + tnodes_values, // 14 + post_transform, // 15 + ttarget_class_ids, // 16 + ttarget_class_nodeids, // 17 + ttarget_class_treeids, // 18 + ttarget_class_weights // 19 ); } @@ -105,23 +105,23 @@ class RuntimeTreeEnsembleClassifier const std::vector &target_class_treeids, // 18 const std::vector &target_class_weights // 19 ) { - this->Init(aggregate_function, // 3 - base_values, // 4 - n_targets_or_classes, // 5 - nodes_falsenodeids, // 6 - nodes_featureids, // 7 - nodes_hitrates, // 8 - nodes_missing_value_tracks_true, // 9 - nodes_modes, // 10 - nodes_nodeids, // 11 - nodes_treeids, // 12 - nodes_truenodeids, // 13 - nodes_values, // 14 - post_transform, // 15 - target_class_ids, // 16 - target_class_nodeids, // 17 - target_class_treeids, // 18 - target_class_weights // 19 + this->Init(aggregate_function, // 3 + base_values, // 4 + n_targets_or_classes, // 5 + nodes_falsenodeids, // 6 + nodes_featureids, // 7 + nodes_hitrates, // 8 + nodes_missing_value_tracks_true, // 9 + nodes_modes, // 10 + nodes_nodeids, // 11 + nodes_treeids, // 12 + nodes_truenodeids, // 13 + nodes_values, // 14 + post_transform, // 15 + target_class_ids, // 16 + target_class_nodeids, // 17 + target_class_treeids, // 18 + target_class_weights // 19 ); } diff --git a/onnx_extended/reference/c_reference_evaluator.py b/onnx_extended/reference/c_reference_evaluator.py index 8b04acdc..4ffb5c63 100644 --- a/onnx_extended/reference/c_reference_evaluator.py +++ b/onnx_extended/reference/c_reference_evaluator.py @@ -4,15 +4,6 @@ from onnx.defs import 
get_schema from onnx.reference import ReferenceEvaluator from onnx.reference.op_run import OpRun -from onnx_extended.reference.c_ops.c_op_conv import Conv -from onnx_extended.reference.c_ops.c_op_tree_ensemble_regressor import ( - TreeEnsembleRegressor_1, - TreeEnsembleRegressor_3, -) -from onnx_extended.reference.c_ops.c_op_tree_ensemble_classifier import ( - TreeEnsembleClassifier_1, - TreeEnsembleClassifier_3, -) class CReferenceEvaluator(ReferenceEvaluator): @@ -29,13 +20,28 @@ class CReferenceEvaluator(ReferenceEvaluator): ref = ReferenceEvaluator(..., new_ops=[Conv]) """ - default_ops = [ - Conv, - TreeEnsembleClassifier_1, - TreeEnsembleClassifier_3, - TreeEnsembleRegressor_1, - TreeEnsembleRegressor_3, - ] + def default_ops(): + from onnx_extended.reference.c_ops.c_op_conv import Conv + from onnx_extended.reference.c_ops.c_op_tree_ensemble_regressor import ( + TreeEnsembleRegressor_1, + TreeEnsembleRegressor_3, + ) + from onnx_extended.reference.c_ops.c_op_tree_ensemble_classifier import ( + TreeEnsembleClassifier_1, + TreeEnsembleClassifier_3, + ) + from onnx_extended.reference.c_custom_ops.custom_op_tree_ensemble_regressor import ( # noqa: E501 + TreeEnsembleRegressor_1 as TreeEnsembleRegressor_1_Float, + ) + + return [ + Conv, + TreeEnsembleClassifier_1, + TreeEnsembleClassifier_3, + TreeEnsembleRegressor_1, + TreeEnsembleRegressor_3, + TreeEnsembleRegressor_1_Float, + ] @staticmethod def filter_ops(proto, new_ops, opsets): @@ -84,10 +90,10 @@ def __init__( **kwargs, ): if new_ops is None: - new_ops = CReferenceEvaluator.default_ops + new_ops = CReferenceEvaluator.default_ops() else: new_ops = new_ops.copy() - new_ops.extend(CReferenceEvaluator.default_ops) + new_ops.extend(CReferenceEvaluator.default_ops()) new_ops = CReferenceEvaluator.filter_ops(proto, new_ops, opsets) ReferenceEvaluator.__init__( diff --git a/onnx_extended/validation/cpu/_validation.cpp b/onnx_extended/validation/cpu/_validation.cpp index a6e768b5..67d74b3b 100644 --- 
a/onnx_extended/validation/cpu/_validation.cpp +++ b/onnx_extended/validation/cpu/_validation.cpp @@ -116,8 +116,7 @@ and parallelization (see `AVX API :return: sum of all elements )pbdoc"); - m.def("vector_add", &vector_add, - py::arg("v1"), py::arg("v2"), + m.def("vector_add", &vector_add, py::arg("v1"), py::arg("v2"), R"pbdoc(Computes the addition of 2 vectors of any dimensions. It assumes both vectors have the same dimensions (no broadcast).). @@ -125,5 +124,4 @@ It assumes both vectors have the same dimensions (no broadcast).). :param v2: second vector :return: new vector )pbdoc"); - } diff --git a/onnx_extended/validation/cpu/speed_metrics.cpp b/onnx_extended/validation/cpu/speed_metrics.cpp index 522f7e31..c68e4f48 100644 --- a/onnx_extended/validation/cpu/speed_metrics.cpp +++ b/onnx_extended/validation/cpu/speed_metrics.cpp @@ -72,7 +72,7 @@ benchmark_cache_tree(int64_t n_rows, int64_t n_features, int64_t n_trees, if (seed > 10037) seed = n_features * 7 + 1; - #pragma omp parallel for +#pragma omp parallel for for (int64_t t = 0; t < n_trees; ++t) { int64_t end = batch + step < n_rows ? 
batch + step : n_rows; for (int64_t i = batch; i < end; ++i) { diff --git a/onnx_extended/validation/cpu/speed_metrics.h b/onnx_extended/validation/cpu/speed_metrics.h index ac068ec6..81de83d0 100644 --- a/onnx_extended/validation/cpu/speed_metrics.h +++ b/onnx_extended/validation/cpu/speed_metrics.h @@ -27,7 +27,7 @@ inline bool _isnan_(double x) { } inline bool _isnan_(float x) { - uint32_t* pv = reinterpret_cast(&x); + uint32_t *pv = reinterpret_cast(&x); uint32_t b = *pv; return (b & 0x7fc00000) == 0x7fc00000; } diff --git a/onnx_extended/validation/cpu/vector_function.cpp b/onnx_extended/validation/cpu/vector_function.cpp index d2508e51..991a8fc7 100644 --- a/onnx_extended/validation/cpu/vector_function.cpp +++ b/onnx_extended/validation/cpu/vector_function.cpp @@ -16,7 +16,7 @@ namespace validation { -float vector_sum(int nl, int nc, const float* values, int by_rows) { +float vector_sum(int nl, int nc, const float *values, int by_rows) { float total = 0; if (by_rows) { for (size_t i = 0; i < nl; ++i) { diff --git a/onnx_extended/validation/cpu/vector_function.h b/onnx_extended/validation/cpu/vector_function.h index 81b34ea9..20f0b1f8 100644 --- a/onnx_extended/validation/cpu/vector_function.h +++ b/onnx_extended/validation/cpu/vector_function.h @@ -6,6 +6,6 @@ namespace validation { -float vector_sum(int nl, int nc, const float* values, int by_rows); +float vector_sum(int nl, int nc, const float *values, int by_rows); } // namespace validation diff --git a/onnx_extended/validation/cpu/vector_sum.cpp b/onnx_extended/validation/cpu/vector_sum.cpp index f536b488..300954df 100644 --- a/onnx_extended/validation/cpu/vector_sum.cpp +++ b/onnx_extended/validation/cpu/vector_sum.cpp @@ -166,38 +166,39 @@ float vector_sum_array_avx_parallel(int nc, } py_array_float vector_add(const py_array_float &v1, const py_array_float &v2) { - // Based on tutorial https://pybind11.readthedocs.io/en/stable/advanced/pycpp/numpy.html - if (v1.ndim() != v2.ndim()) { - throw 
std::runtime_error("Vector v1 and v2 must have the same shape."); - } - for(int i = 0; i < v1.ndim(); ++i) { - if (v1.shape(i) != v2.shape(i)) { - throw std::runtime_error("Vector v1 and v2 must have the same shape."); - } - } - std::vector shape(v1.ndim()); - for (int i = 0; i < v1.ndim(); ++i) { - shape[i] = v1.shape(i); - } - py::buffer_info b1 = v1.request(); - py::buffer_info b2 = v2.request(); - py_array_float result = py::array_t(shape); - py::buffer_info br = result.request(); - - const float * p1 = static_cast(b1.ptr); // pointer on v1 data - const float * p2 = static_cast(b2.ptr); // pointer on v2 data - float * pr = static_cast(br.ptr); // pointer on result data - if (p1 == nullptr || p2 == nullptr || pr == nullptr) { - throw std::runtime_error("One vector is empty."); - } - - // Here the addition. - int64_t size = v1.size(); - for(int64_t i = 0; i < size; ++i) { - pr[i] = p1[i] + p2[i]; - } - - return result; + // Based on tutorial + // https://pybind11.readthedocs.io/en/stable/advanced/pycpp/numpy.html + if (v1.ndim() != v2.ndim()) { + throw std::runtime_error("Vector v1 and v2 must have the same shape."); + } + for (int i = 0; i < v1.ndim(); ++i) { + if (v1.shape(i) != v2.shape(i)) { + throw std::runtime_error("Vector v1 and v2 must have the same shape."); + } + } + std::vector shape(v1.ndim()); + for (int i = 0; i < v1.ndim(); ++i) { + shape[i] = v1.shape(i); + } + py::buffer_info b1 = v1.request(); + py::buffer_info b2 = v2.request(); + py_array_float result = py::array_t(shape); + py::buffer_info br = result.request(); + + const float *p1 = static_cast(b1.ptr); // pointer on v1 data + const float *p2 = static_cast(b2.ptr); // pointer on v2 data + float *pr = static_cast(br.ptr); // pointer on result data + if (p1 == nullptr || p2 == nullptr || pr == nullptr) { + throw std::runtime_error("One vector is empty."); + } + + // Here the addition. 
+ int64_t size = v1.size(); + for (int64_t i = 0; i < size; ++i) { + pr[i] = p1[i] + p2[i]; + } + + return result; } } // namespace validation diff --git a/onnx_extended/validation/cpu/vector_sum.h b/onnx_extended/validation/cpu/vector_sum.h index 53d1b9c3..596f7478 100644 --- a/onnx_extended/validation/cpu/vector_sum.h +++ b/onnx_extended/validation/cpu/vector_sum.h @@ -13,7 +13,7 @@ namespace py = pybind11; namespace validation { - + // vector_sum float vector_sum(int nc, const std::vector &values, bool by_rows); diff --git a/onnx_extended/validation/cuda/__init__.py b/onnx_extended/validation/cuda/__init__.py index 8b137891..4dbda618 100644 --- a/onnx_extended/validation/cuda/__init__.py +++ b/onnx_extended/validation/cuda/__init__.py @@ -1 +1,14 @@ - +def cuda_version() -> str: + """ + Returns the cuda version it was compiled with. + If CUDA was not available, it returns `"0.0"`. + """ + try: + from .cuda_example_py import cuda_version as cv + except ImportError: + # No CUDA + return "0.0" + v = cv() + major = v // 1000 + minor = (v % 1000) // 10 + return f"{major}.{minor}" diff --git a/onnx_extended/validation/cuda/cuda_example.cu b/onnx_extended/validation/cuda/cuda_example.cu index ae5f0b63..0c9d18ab 100644 --- a/onnx_extended/validation/cuda/cuda_example.cu +++ b/onnx_extended/validation/cuda/cuda_example.cu @@ -11,36 +11,42 @@ namespace cuda_example { -__global__ void block_vector_add(const float *a, const float *b, float *c, int n) { +__global__ void block_vector_add(const float *a, const float *b, float *c, + int n) { int i = blockIdx.x * blockDim.x + threadIdx.x; if (i < n) { c[i] = a[i] + b[i]; } } -void kernel_vector_add(unsigned int size, const float* gpu_ptr1, const float* gpu_ptr2, float* gpu_res) { +void kernel_vector_add(unsigned int size, const float *gpu_ptr1, + const float *gpu_ptr2, float *gpu_res) { constexpr int blockSize = 256; int numBlocks = (size + blockSize - 1) / blockSize; - block_vector_add<<>>(gpu_ptr1, gpu_ptr2, gpu_res, size); + 
block_vector_add<<>>(gpu_ptr1, gpu_ptr2, gpu_res, size); } -void vector_add(unsigned int size, const float* ptr1, const float* ptr2, float* br, int cudaDevice) { +void vector_add(unsigned int size, const float *ptr1, const float *ptr2, + float *br, int cudaDevice) { // copy memory from CPU memory to CUDA memory NVTX_SCOPE("vector_add") checkCudaErrors(cudaSetDevice(cudaDevice)); float *gpu_ptr1, *gpu_ptr2, *gpu_res; checkCudaErrors(cudaMalloc(&gpu_ptr1, size * sizeof(float))); - checkCudaErrors(cudaMemcpy(gpu_ptr1, ptr1, size * sizeof(float), cudaMemcpyHostToDevice)); + checkCudaErrors( + cudaMemcpy(gpu_ptr1, ptr1, size * sizeof(float), cudaMemcpyHostToDevice)); checkCudaErrors(cudaMalloc(&gpu_ptr2, size * sizeof(float))); - checkCudaErrors(cudaMemcpy(gpu_ptr2, ptr2, size * sizeof(float), cudaMemcpyHostToDevice)); + checkCudaErrors( + cudaMemcpy(gpu_ptr2, ptr2, size * sizeof(float), cudaMemcpyHostToDevice)); checkCudaErrors(cudaMalloc(&gpu_res, size * sizeof(float))); // execute the code kernel_vector_add(size, gpu_ptr1, gpu_ptr2, gpu_res); - checkCudaErrors(cudaMemcpy(br, gpu_res, size * sizeof(float), cudaMemcpyDeviceToHost)); + checkCudaErrors( + cudaMemcpy(br, gpu_res, size * sizeof(float), cudaMemcpyDeviceToHost)); // free the allocated vectors checkCudaErrors(cudaFree(gpu_ptr1)); @@ -49,16 +55,17 @@ void vector_add(unsigned int size, const float* ptr1, const float* ptr2, float* } unsigned int nextPow2(unsigned int x) { - --x; - x |= x >> 1; - x |= x >> 2; - x |= x >> 4; - x |= x >> 8; - x |= x >> 16; - return ++x; + --x; + x |= x >> 1; + x |= x >> 2; + x |= x >> 4; + x |= x >> 8; + x |= x >> 16; + return ++x; } -__global__ void kernel_sum_reduce0(float *g_idata, float *g_odata, unsigned int n) { +__global__ void kernel_sum_reduce0(float *g_idata, float *g_odata, + unsigned int n) { extern __shared__ float sdata[]; // load shared mem @@ -69,9 +76,9 @@ __global__ void kernel_sum_reduce0(float *g_idata, float *g_odata, unsigned int __syncthreads(); - for 
(unsigned int s=1; s < blockDim.x; s *= 2) { + for (unsigned int s = 1; s < blockDim.x; s *= 2) { // modulo arithmetic is slow! - if ((tid % (2*s)) == 0) { + if ((tid % (2 * s)) == 0) { sdata[tid] += sdata[tid + s]; } __syncthreads(); @@ -82,20 +89,24 @@ __global__ void kernel_sum_reduce0(float *g_idata, float *g_odata, unsigned int } } -float kernel_vector_sum_reduce0(float* gpu_ptr, unsigned int size, int maxThreads) { +float kernel_vector_sum_reduce0(float *gpu_ptr, unsigned int size, + int maxThreads) { int threads = (size < maxThreads) ? nextPow2(size) : maxThreads; - int blocks = (size + threads - 1) / threads; + int blocks = (size + threads - 1) / threads; dim3 dimBlock(threads, 1, 1); dim3 dimGrid(blocks, 1, 1); - float* gpu_block_ptr; + float *gpu_block_ptr; checkCudaErrors(cudaMalloc(&gpu_block_ptr, blocks * sizeof(float))); - int smemSize = (threads <= 32) ? 2 * threads * sizeof(float) : threads * sizeof(float); - kernel_sum_reduce0<<>>(gpu_ptr, gpu_block_ptr, size); + int smemSize = + (threads <= 32) ? 2 * threads * sizeof(float) : threads * sizeof(float); + kernel_sum_reduce0<<>>(gpu_ptr, gpu_block_ptr, + size); // the last reduction happens on CPU, the first step is to move // the data from GPU to CPU. 
- float* cpu_ptr = new float[blocks]; - checkCudaErrors(cudaMemcpy(cpu_ptr, gpu_block_ptr, blocks * sizeof(float), cudaMemcpyDeviceToHost)); + float *cpu_ptr = new float[blocks]; + checkCudaErrors(cudaMemcpy(cpu_ptr, gpu_block_ptr, blocks * sizeof(float), + cudaMemcpyDeviceToHost)); float gpu_result = 0; for (int i = 0; i < blocks; ++i) { gpu_result += cpu_ptr[i]; @@ -105,15 +116,15 @@ float kernel_vector_sum_reduce0(float* gpu_ptr, unsigned int size, int maxThread return gpu_result; } -float vector_sum0(unsigned int size, const float* ptr, int maxThreads, +float vector_sum0(unsigned int size, const float *ptr, int maxThreads, int cudaDevice) { // copy memory from CPU memory to CUDA memory NVTX_SCOPE("vector_sum0") float *gpu_ptr; checkCudaErrors(cudaSetDevice(cudaDevice)); checkCudaErrors(cudaMalloc(&gpu_ptr, size * sizeof(float))); - checkCudaErrors(cudaMemcpy(gpu_ptr, ptr, size * sizeof(float), - cudaMemcpyHostToDevice)); + checkCudaErrors( + cudaMemcpy(gpu_ptr, ptr, size * sizeof(float), cudaMemcpyHostToDevice)); // execute the code float result = kernel_vector_sum_reduce0(gpu_ptr, size, maxThreads); @@ -124,23 +135,23 @@ float vector_sum0(unsigned int size, const float* ptr, int maxThreads, } __global__ void vector_sum(float *input, float *output, unsigned int size) { - int tid = threadIdx.x + blockIdx.x * blockDim.x; - int stride = blockDim.x * gridDim.x; - float sum = 0.0f; - for (int i = tid; i < size; i += stride) { - sum += input[i]; - } - atomicAdd(output, sum); + int tid = threadIdx.x + blockIdx.x * blockDim.x; + int stride = blockDim.x * gridDim.x; + float sum = 0.0f; + for (int i = tid; i < size; i += stride) { + sum += input[i]; + } + atomicAdd(output, sum); } -float vector_sum_atomic(unsigned int size, const float* ptr, - int maxThreads, int cudaDevice) { +float vector_sum_atomic(unsigned int size, const float *ptr, int maxThreads, + int cudaDevice) { NVTX_SCOPE("vector_sum_atomic") float *input, *output; float sum = 0.0f; cudaMalloc(&input, size * 
sizeof(float)); - checkCudaErrors(cudaMemcpy(input, ptr, size * sizeof(float), - cudaMemcpyHostToDevice)); + checkCudaErrors( + cudaMemcpy(input, ptr, size * sizeof(float), cudaMemcpyHostToDevice)); cudaMalloc(&output, sizeof(float)); cudaMemcpy(output, &sum, sizeof(float), cudaMemcpyHostToDevice); vector_sum<<>>(input, output, size); @@ -150,5 +161,4 @@ float vector_sum_atomic(unsigned int size, const float* ptr, return sum; } - } // namespace cuda_example diff --git a/onnx_extended/validation/cuda/cuda_example.cuh b/onnx_extended/validation/cuda/cuda_example.cuh index d325000c..594e782b 100644 --- a/onnx_extended/validation/cuda/cuda_example.cuh +++ b/onnx_extended/validation/cuda/cuda_example.cuh @@ -5,8 +5,10 @@ unsigned int nextPow2(unsigned int x); void vector_add(unsigned int size, const float *ptr1, const float *ptr2, float *ptr3, int cudaDevice); -float vector_sum0(unsigned int size, const float *ptr, int max_threads, int cudaDevice); +float vector_sum0(unsigned int size, const float *ptr, int max_threads, + int cudaDevice); -float vector_sum_atomic(unsigned int size, const float* ptr, int maxThreads, int cudaDevice); +float vector_sum_atomic(unsigned int size, const float *ptr, int maxThreads, + int cudaDevice); } // namespace cuda_example diff --git a/onnx_extended/validation/cuda/cuda_example_reduce.cu b/onnx_extended/validation/cuda/cuda_example_reduce.cu index debbc5ef..1f323096 100644 --- a/onnx_extended/validation/cuda/cuda_example_reduce.cu +++ b/onnx_extended/validation/cuda/cuda_example_reduce.cu @@ -13,11 +13,11 @@ namespace cuda_example { -#define reduce6_block_and_sync(I,I2) \ - if ((blockSize >= I) && (tid < I2)) { \ - sdata[tid] = mySum = mySum + sdata[tid + I2]; \ - } \ - __syncthreads(); +#define reduce6_block_and_sync(I, I2) \ + if ((blockSize >= I) && (tid < I2)) { \ + sdata[tid] = mySum = mySum + sdata[tid + I2]; \ + } \ + __syncthreads(); template __global__ void kernel_reduce6(const T *g_idata, T *g_odata, unsigned int n) { @@ -43,13 
+43,12 @@ __global__ void kernel_reduce6(const T *g_idata, T *g_odata, unsigned int n) { sdata[tid] = mySum; __syncthreads(); - // reduction within a block in shared memory reduce6_block_and_sync(512, 256); reduce6_block_and_sync(256, 128); reduce6_block_and_sync(128, 64); -#if (__CUDA_ARCH__ >= 300 ) +#if (__CUDA_ARCH__ >= 300) if (tid < 32) { if (blockSize >= 64) { mySum += sdata[tid + 32]; @@ -82,20 +81,23 @@ bool isPow2(unsigned int n) { return (n & (n - 1)) == 0; } -#define case_vector_sum_6_block(T, I, B) \ - case I: \ - kernel_reduce6<<>>(gpu_ptr, gpu_block_ptr, size); \ - break; +#define case_vector_sum_6_block(T, I, B) \ + case I: \ + kernel_reduce6 \ + <<>>(gpu_ptr, gpu_block_ptr, size); \ + break; -float kernel_vector_sum_6(unsigned int size, const float* gpu_ptr, int maxThreads) { +float kernel_vector_sum_6(unsigned int size, const float *gpu_ptr, + int maxThreads) { int threads = (size < maxThreads) ? nextPow2(size) : maxThreads; - int blocks = (size + threads - 1) / threads; + int blocks = (size + threads - 1) / threads; dim3 dimBlock(threads, 1, 1); dim3 dimGrid(blocks, 1, 1); - float* gpu_block_ptr; + float *gpu_block_ptr; checkCudaErrors(cudaMalloc(&gpu_block_ptr, blocks * sizeof(float))); - int smemSize = (threads <= 32) ? 2 * threads * sizeof(float) : threads * sizeof(float); + int smemSize = + (threads <= 32) ? 2 * threads * sizeof(float) : threads * sizeof(float); if (isPow2(size)) { switch (threads) { @@ -110,8 +112,7 @@ float kernel_vector_sum_6(unsigned int size, const float* gpu_ptr, int maxThread case_vector_sum_6_block(float, 2, true); case_vector_sum_6_block(float, 1, true); } - } - else { + } else { switch (threads) { case_vector_sum_6_block(float, 512, false); case_vector_sum_6_block(float, 256, false); @@ -128,8 +129,9 @@ float kernel_vector_sum_6(unsigned int size, const float* gpu_ptr, int maxThread // the last reduction happens on CPU, the first step is to move // the data from GPU to CPU. 
- float* cpu_ptr = new float[blocks]; - checkCudaErrors(cudaMemcpy(cpu_ptr, gpu_block_ptr, blocks * sizeof(float), cudaMemcpyDeviceToHost)); + float *cpu_ptr = new float[blocks]; + checkCudaErrors(cudaMemcpy(cpu_ptr, gpu_block_ptr, blocks * sizeof(float), + cudaMemcpyDeviceToHost)); float gpu_result = 0; for (int i = 0; i < blocks; ++i) { gpu_result += cpu_ptr[i]; @@ -137,18 +139,17 @@ float kernel_vector_sum_6(unsigned int size, const float* gpu_ptr, int maxThread checkCudaErrors(cudaFree(gpu_block_ptr)); delete[] cpu_ptr; return gpu_result; - } -float vector_sum6(unsigned int size, const float* ptr, int maxThreads, +float vector_sum6(unsigned int size, const float *ptr, int maxThreads, int cudaDevice) { // copy memory from CPU memory to CUDA memory NVTX_SCOPE("vector_sum6") float *gpu_ptr; checkCudaErrors(cudaSetDevice(cudaDevice)); checkCudaErrors(cudaMalloc(&gpu_ptr, size * sizeof(float))); - checkCudaErrors(cudaMemcpy(gpu_ptr, ptr, size * sizeof(float), - cudaMemcpyHostToDevice)); + checkCudaErrors( + cudaMemcpy(gpu_ptr, ptr, size * sizeof(float), cudaMemcpyHostToDevice)); // execute the code float result = kernel_vector_sum_6(size, gpu_ptr, maxThreads); diff --git a/pyproject.toml b/pyproject.toml index 61fe4f4c..4f715d75 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -19,7 +19,7 @@ classifiers = [ "Programming Language :: Python :: 3.11", ] dependencies = ["numpy", "onnx>=1.14.0", "scipy"] -description = "Extends the list of supported operators in onnx reference implementation, or implements faster versions in C++." +description = "Extends the list of supported operators in onnx reference implementation and onnxruntime, or implements faster versions in C++." 
keywords = ["onnx", "onnxruntime", "CUDA", "openmp", "cmake", "cython", "pybind11"] license = {file = "LICENSE.txt"} name = "onnx-extended" @@ -31,7 +31,7 @@ version = "0.2.0" homepage = "https://sdpython.github.io/doc/onnx-extended/" documentation = "https://sdpython.github.io/doc/onnx-extended/" repository = "https://github.com/sdpython/onnx-extended" -# changelog = "https://github.com/sdpython/onnx-extended/CHANGELOG.md" +changelog = "https://github.com/sdpython/onnx-extended/CHANGELOGS.rst" [project.optional-dependencies] dev = [ @@ -51,7 +51,7 @@ dev = [ "ml-dtypes", "onnx-array-api", "onnxmltools", - "onnxruntime; python_version < '3.11'", + "onnxruntime", "pandas", "psutil", "pyquickhelper>=1.12.3821", @@ -96,7 +96,7 @@ ignore_messages = "Duplicate implicit target name: \"setup.py\"" namespaces = false [tool.setuptools.package-data] -"*" = ["*.cc", "*.cpp", "*.cu", "*.cuh", "*.dylib", "*.h", "*.hpp", "*.pyd", "*.so*"] +"*" = ["*.cc", "*.cpp", "*.cu", "*.cuh", "*.dll", "*.dylib", "*.h", "*.hpp", "*.pyd", "*.so*"] [tool.cibuildwheel] build = "*" @@ -105,7 +105,7 @@ manylinux-x86_64-image = "manylinux2014" [tool.cibuildwheel.linux] archs = ["x86_64"] build = "cp*" -skip = "cp36-* cp37-* cp38-* cp39-* cp311-* pypy* *musllinux*" +skip = "cp36-* cp37-* cp38-* cp39-* pypy* *musllinux*" [tool.cibuildwheel.macos] archs = ["x86_64"] diff --git a/requirements-dev.txt b/requirements-dev.txt index 7b26af67..3fdd1ebb 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -14,7 +14,7 @@ matplotlib ml-dtypes onnx-array-api onnxmltools -onnxruntime; python_version < '3.11' +onnxruntime openpyxl packaging pandas diff --git a/setup.py b/setup.py index 90672585..64e0f6fc 100644 --- a/setup.py +++ b/setup.py @@ -37,10 +37,14 @@ "*.hpp", "*.pyd", "*.so*", + "*.dll", ] package_data = { - "onnx_extended.ortcy.wrap": known_extensions, + "onnx_extended": known_extensions, + "onnx_extended.ortops.optim.cpu": known_extensions, "onnx_extended.ortops.tutorial.cpu": 
known_extensions, + "onnx_extended.ortops.tutorial.cuda": known_extensions, + "onnx_extended.ortcy.wrap": known_extensions, "onnx_extended.reference.c_ops.cpu": known_extensions, "onnx_extended.validation.cpu": known_extensions, "onnx_extended.validation.cython": known_extensions, @@ -62,7 +66,7 @@ except FileNotFoundError: long_description = "" -version_str = "0.1.0" +VERSION_STR = "0.2.0" with open(os.path.join(here, "onnx_extended/__init__.py"), "r") as f: line = [ _ @@ -70,7 +74,7 @@ if _.startswith("__version__") ] if len(line) > 0: - version_str = line[0].split("=")[1].strip('" ') + VERSION_STR = line[0].split("=")[1].strip('" ') ######################################## # C++ Helper @@ -213,31 +217,31 @@ def __init__(self, name: str, library: str = "") -> None: class cmake_build_ext(build_ext): user_options = [ *build_ext.user_options, - ("enable-nvtx=", None, "Enables compilation with NVTX events."), ( - "with-cuda=", + "use-cuda", None, "If cuda is available, CUDA is " "used by default unless this option is set to 0", ), + ("use-nvtx=", None, "Enables compilation with NVTX events."), ( - "cuda-version=", + "cuda-version", None, "If cuda is available, it searches the installed version " "unless this option is defined.", ), ( - "parallel=", + "parallel", None, "Parallelization", ), ( - "ort-version=", + "ort-version", None, "onnxruntime version, a path is allowed", ), ( - "cuda-build=", + "cuda-build", None, "CUDA code can be compiled to be working with " "different architectures, this flag can optimize " @@ -247,28 +251,68 @@ class cmake_build_ext(build_ext): ] def initialize_options(self): - self.enable_nvtx = None - self.with_cuda = None + self.use_nvtx = None + self.use_cuda = None self.cuda_version = None self.parallel = None self.ort_version = DEFAULT_ORT_VERSION self.cuda_build = "DEFAULT" + build_ext.initialize_options(self) + # boolean + b_values = {0, 1, "1", "0", True, False} + t_values = {1, "1", True} + for att in ["use_nvtx", "use_cuda"]: + v = 
getattr(self, att) + if v is not None: + continue + v = os.environ.get(att.upper(), None) + if v is None: + continue + if v not in b_values: + raise ValueError(f"Unable to interpret value {v} for {att.upper()!r}.") + print(f"-- setup: use env {att.upper()}={v in t_values}") + setattr(self, att, v in t_values) + if self.ort_version is None: + self.ort_version = os.environ.get("ORT_VERSION", None) + if self.ort_version not in ("", None): + print(f"-- setup: use env ORT_VERSION={self.ort_version}") + if self.cuda_build is None: + self.cuda_build = os.environ.get("CUDA_BUILD", None) + if self.cuda_build not in ("", None): + print(f"-- setup: use env CUDA_BUILD={self.cuda_build}") + if self.cuda_version is None: + self.cuda_version = os.environ.get("CUDA_VERSION", None) + if self.cuda_version not in ("", None): + print(f"-- setup: use env CUDA_VERSION={self.cuda_version}") + if self.use_nvtx is None: + self.use_nvtx = False + def finalize_options(self): - b_values = {None, 0, 1, "1", "0", True, False} - if self.enable_nvtx not in b_values: - raise ValueError(f"enable_nvtx={self.enable_nvtx!r} must be in {b_values}.") - if self.with_cuda not in b_values: - raise ValueError(f"with_cuda={self.with_cuda!r} must be in {b_values}.") - self.enable_nvtx = self.enable_nvtx in {1, "1", True, "True"} - self.with_cuda = self.with_cuda in {1, "1", True, "True", None} + build_ext.finalize_options(self) + + b_values = {0, 1, "1", "0", True, False, "True", "False"} + if self.use_nvtx not in b_values: + raise ValueError(f"use_nvtx={self.use_nvtx!r} must be in {b_values}.") + if self.use_cuda is None: + self.use_cuda = find_cuda() + if self.use_cuda not in b_values: + raise ValueError(f"use_cuda={self.use_cuda!r} must be in {b_values}.") + self.use_nvtx = self.use_nvtx in {1, "1", True, "True"} + self.use_cuda = self.use_cuda in {1, "1", True, "True"} if self.cuda_version in (None, ""): self.cuda_version = None build = {"DEFAULT", "H100", "H100opt"} if self.cuda_build not in build: - 
raise ValueError(f"cuda-built={self.cuda_build} not in {build}.") - build_ext.finalize_options(self) + raise ValueError(f"cuda-build={self.cuda_build!r} not in {build}.") + + options = {o[0]: o for o in self.user_options} + keys = list(sorted(options.keys())) + for na in keys: + opt = options[na] + name = opt[0].replace("-", "_").strip("=") + print(f"-- setup: option {name}={getattr(self, name, None)}") def get_cmake_args(self, cfg: str) -> List[str]: """ @@ -296,18 +340,17 @@ def get_cmake_args(self, cfg: str) -> List[str]: f"-DPYTHON_VERSION_MM={versmm}", f"-DPYTHON_MODULE_EXTENSION={module_ext}", f"-DORT_VERSION={self.ort_version}", + f"-DONNX_EXTENDED_VERSION={VERSION_STR}", ] if self.parallel is not None: cmake_args.append(f"-j{self.parallel}") - if os.environ.get("USE_NVTX", "0") in (1, "1") or self.enable_nvtx: + if self.use_nvtx: cmake_args.append("-DUSE_NVTX=1") - if os.environ.get("USE_CUDA", "1") in (0, "0") or not self.with_cuda: - cmake_args.append("-DUSE_CUDA=0") - else: - cmake_args.append("-DUSE_CUDA=1") + cmake_args.append(f"-DUSE_CUDA={1 if self.use_cuda else 0}") + if self.use_cuda: cmake_args.append(f"-DCUDA_BUILD={self.cuda_build}") - cuda_version = self.cuda_version or os.environ.get("CUDA_VERSION", "") + cuda_version = self.cuda_version if cuda_version not in (None, ""): cmake_args.append(f"-DCUDA_VERSION={cuda_version}") @@ -418,7 +461,7 @@ def process_extensions(self, cfg: str, build_path: str, build_lib: str): raise FileNotFoundError(f"Unable to find {look!r}.") if not os.path.exists(dest): raise FileNotFoundError(f"Unable to find folder {dest!r}.") - print(f"-- copy {look!r} to {dest!r}") + print(f"-- setup: copy-2 {look!r} to {dest!r}") shutil.copy(look, dest) def _process_setup_ext_line(self, cfg, build_path, line): @@ -450,7 +493,7 @@ def _process_setup_ext_line(self, cfg, build_path, line): raise FileNotFoundError( f"Unable to find library {fullname!r} (line={line!r})." 
) - print(f"-- copy {fullname!r} to {fulldest!r}") + print(f"-- setup: copy-1 {fullname!r} to {fulldest!r}") shutil.copy(fullname, fulldest) else: raise RuntimeError(f"Unable to interpret line {line!r}.") @@ -484,8 +527,11 @@ def build_extensions(self): cfg = "Release" cmake_args = self.get_cmake_args(cfg) build_path, build_lib = self.build_cmake(cfg, cmake_args) + print("-- process_setup_ext") self.process_setup_ext(cfg, build_path, "_setup_ext.txt") + print("-- process_extensions") self.process_extensions(cfg, build_path, build_lib) + print("-- done") def get_ext_modules(): @@ -500,8 +546,8 @@ def get_ext_modules(): has_cuda = find_cuda() if has_cuda: add_cuda = True - if "--with-cuda" in sys.argv: - pos = sys.argv.index("--with-cuda") + if "--use-cuda" in sys.argv: + pos = sys.argv.index("--use-cuda") if len(sys.argv) > pos + 1 and sys.argv[pos + 1] in ( "0", 0, @@ -509,10 +555,8 @@ def get_ext_modules(): "False", ): add_cuda = False - elif "--with-cuda=0" in sys.argv: + elif os.environ.get("USE_CUDA", None) in {0, "0", False}: add_cuda = False - elif "--with-cuda=1" in sys.argv or "--with-cuda=guess": - add_cuda = True if add_cuda: cuda_extensions.extend( [ @@ -555,7 +599,7 @@ def get_ext_modules(): setup( name="onnx-extended", - version=version_str, + version=VERSION_STR, description="More operators for onnx reference implementation", long_description=long_description, author="Xavier Dupré",