sdpython · sdpython · Jul 7, 2023 · Jul 4, 2023 · Jul 4, 2023 · Jul 4, 2023
diff --git a/.gitignore b/.gitignore
@@ -9,6 +9,7 @@
 *.sln
 *.cmake
 *.whl
+*.def
 /*.png
 /*.onnx
 .build_path.txt

diff --git a/CHANGELOGS.rst b/CHANGELOGS.rst
@@ -4,6 +4,7 @@ Change Logs
 0.2.0
 +++++
 
+* :pr:`41`: implements a custom kernel for RandomForestRegressor that is easier to optimize
 * :pr:`34`: update to onnxruntime v1.15.1
 * :pr:`31`: implement a custom CUDA kernel (gemm)
 * :pr:`32`: update to onnxruntime v1.15.0

diff --git a/README.rst b/README.rst
@@ -29,7 +29,7 @@ onnx-extended: extensions for onnx and onnxruntime
 **onnx-extended** extends the list of supported operators in onnx
 reference implementation, or implements faster versions in C++.
 Documentation `onnx-extended
-<http://www.xavierdupre.fr/app/onnx-extended/helpsphinx/index.html>`_.
+<https://sdpython.github.io/doc/onnx-extended/>`_.
-Source are available on `github/onnx-extended
+Sources are available on `github/onnx-extended
 <https://github.com/sdpython/onnx-extended>`_.
 
@@ -115,9 +115,9 @@ can be enabled with the following command:
 
 ::
 
-    python setup.py build_ext --inplace --enable_nvtx 1
-    # or
-    pip install -e . --config-settings="--enable_nvtx=1"
+    python setup.py build_ext --inplace --use_nvtx 1
+    # or (not working yet)
+    pip install -e . --config-settings="--use_nvtx=1"
 
 Experimental cython binding for onnxruntime
 +++++++++++++++++++++++++++++++++++++++++++

diff --git a/_cmake/CMakeLists.txt b/_cmake/CMakeLists.txt
@@ -1,11 +1,12 @@
 cmake_minimum_required(VERSION 3.24.0)
-project(onnx_extended VERSION 0.2.0)
+project(onnx_extended VERSION ${ONNX_EXTENDED_VERSION})
 
 #
 # initialisation
 #
 
 message(STATUS "-------------------")
+message(STATUS "ONNX_EXTENDED_VERSION=${ONNX_EXTENDED_VERSION}")
 message(STATUS "CMAKE_VERSION=${CMAKE_VERSION}")
 message(STATUS "CMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}")
 message(STATUS "CMAKE_C_COMPILER_VERSION=${CMAKE_C_COMPILER_VERSION}")
@@ -25,6 +26,7 @@ message(STATUS "USE_CUDA=${USE_CUDA}")
 message(STATUS "CUDA_BUILD=${CUDA_BUILD}")
 message(STATUS "USE_NVTX=${USE_NVTX}")
 message(STATUS "ORT_VERSION=${ORT_VERSION}")
+
 # message(STATUS "ENV-PATH=$ENV{PATH}")
 # message(STATUS "ENV-PYTHONPATH=$ENV{PYTHONPATH}")
 message(STATUS "--------------------------------------------")
@@ -44,8 +46,8 @@ list(APPEND CMAKE_MODULE_PATH
 # Packages and constants
 #
 
-include("load_externals.cmake")
 include("constants.cmake")
+include("load_externals.cmake")
 
 #
 # modules
@@ -61,8 +63,10 @@ include("targets/cuda_example_py.cmake")
 include("targets/vector_function_cy.cmake")
 
 set(ORTOPS_INCLUDE_DIR "${ROOT_INCLUDE_PATH}/onnx_extended/ortops")
+set(REFOPS_INCLUDE_DIR "${ROOT_INCLUDE_PATH}/onnx_extended/reference/c_ops")
 include("targets/ortops_tutorial_cpu.cmake")
 include("targets/ortops_tutorial_cuda.cmake")
+include("targets/ortops_optim_cpu.cmake")
 
 #
 # write version

diff --git a/_cmake/clang_format.sh b/_cmake/clang_format.sh
@@ -6,11 +6,11 @@ echo "--cython-lint--"
 cython-lint .
 echo "--clang-format--"
 find onnx_extended -type f \( -name "*.h" -o -name "*.hpp" -o -name "*.cuh" -o -name "*.cpp" -o -name "*.cc" -o -name "*.cu" \) | while read f; do
-    echo "Processing '$f'";
-    clang-format --length 88 -i $f;
+    echo "clang-format -i $f";
+    clang-format -i $f;
 done
 echo "--cmake-lint--"
 find _cmake -type f \( -name "*.cmake" -o -name "*.txt" \) | while read f; do
-    echo "Processing '$f'";
+    echo "cmake-lint $f --line-width=88 --disabled-codes C0103 C0113";
     cmake-lint $f --line-width=88 --disabled-codes C0103 C0113;
 done
diff --git a/_cmake/constants.cmake b/_cmake/constants.cmake
@@ -1,3 +1,14 @@
+#
+# python extension
+#
+if(MSVC)
+  set(DLLEXT "dll")
+elseif(APPLE)
+  set(DLLEXT "dylib")
+else()
+  set(DLLEXT "so")
+endif()
+
 #
 # C++ 14 or C++ 17
 #
@@ -37,6 +48,9 @@ else()
 endif()
 
 if(APPLE)
+  message(STATUS "APPLE: set env var for open mp: CC, CXX, LDFLAGS, CPPFLAGS")
+  set(ENV{CC} "/usr/local/opt/llvm/bin/clang")
+  set(ENV{CXX} "/usr/local/opt/llvm/bin/clang++")
-  set(ENV(LDFLAGS) "-L/usr/local/opt/llvm/lib")
-  set(ENV(CPPFLAGS) "-I/usr/local/opt/llvm/include")
+  set(ENV{LDFLAGS} "-L/usr/local/opt/llvm/lib")  # ENV{...}: ENV(...) sets no env var
+  set(ENV{CPPFLAGS} "-I/usr/local/opt/llvm/include")
 endif()

diff --git a/_cmake/externals/FindCudaExtension.cmake b/_cmake/externals/FindCudaExtension.cmake
@@ -4,6 +4,14 @@
-# Defines USE_NTVX to enable profiling with NVIDIA profiler.
+# Defines USE_NVTX to enable profiling with NVIDIA profiler.
 # CUDA_VERSION must be defined as well.
 
+if("${CMAKE_CUDA_COMPILER}" STREQUAL "/usr/bin/nvcc")  # quoted: may be unset here
+  message(FATAL_ERROR
+          "CMAKE_CUDA_COMPILER is equal to '${CMAKE_CUDA_COMPILER}', "
+          "CUDA_VERSION=${CUDA_VERSION}, "
+          "CMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES}, "
+          "You should specify the cuda version by adding --cuda-version=...")
+endif()
+
 if(CUDA_VERSION)
   find_package(CUDAToolkit ${CUDA_VERSION} EXACT)
 else()
@@ -14,6 +22,27 @@ message(STATUS "CUDAToolkit_FOUND=${CUDAToolkit_FOUND}")
 
 if(CUDAToolkit_FOUND)
 
+  message(STATUS "befor1 language CUDA_VERSION=${CUDA_VERSION}")
+  message(STATUS "befor1 language CMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES}")
+  message(STATUS "befor1 language CMAKE_CUDA_COMPILER=${CMAKE_CUDA_COMPILER}")
+
+  if("${CMAKE_CUDA_ARCHITECTURES}" STREQUAL "")  # quoted: also true when undefined
+    set(CMAKE_CUDA_ARCHITECTURES "native")
+  endif()
+  if("${CMAKE_CUDA_COMPILER}" STREQUAL "CMAKE_CUDA_COMPILER-NOTFOUND")
+    if("${CUDA_VERSION}" STREQUAL "")  # an unset version must reach the error
+      message(FATAL_ERROR "No CMAKE_CUDA_COMPILER for CUDA_VERSION=${CUDA_VERSION}. "
+                          "You can use --cuda-version=<CUDA_VERSION> or set "
+                          "CUDACXX=/usr/local/cuda-<CUDA_VERSION>/bin/nvcc")
+    else()
+      set(CMAKE_CUDA_COMPILER "/usr/local/cuda-${CUDA_VERSION}/bin/nvcc")
+      message(STATUS "set CMAKE_CUDA_COMPILER=${CMAKE_CUDA_COMPILER}")
+    endif()
+  endif()
+
+  message(STATUS "before language CUDA_VERSION=${CUDA_VERSION}")
+  message(STATUS "before language CMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES}")
+  message(STATUS "before language CMAKE_CUDA_COMPILER=${CMAKE_CUDA_COMPILER}")
   enable_language(CUDA)
   message(STATUS "------------- CUDA settings")
   message(STATUS "CUDA_VERSION=${CUDA_VERSION}")
@@ -30,7 +59,7 @@ if(CUDAToolkit_FOUND)
                         "< ${CUDA_VERSION}, nvcc is not setup properly. "
                         "Try 'whereis nvcc' and chack the version.")
   endif()
-  
+
   set(CMAKE_CUDA_STANDARD 17)
   set(CMAKE_CUDA_STANDARD_REQUIRED ON)
 
@@ -42,11 +71,13 @@ if(CUDAToolkit_FOUND)
 
   if(CUDA_BUILD STREQUAL "H100opt")
 
-    # see https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/
+    # see https://arnon.dk/
+    # matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/
     set(CMAKE_CUDA_ARCHITECTURES 90)
     set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_90,code=sm_90")
     set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_90a,code=sm_90a")
-    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_90a,code=compute_90a")
+    set(CMAKE_CUDA_FLAGS
+        "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_90a,code=compute_90a")
 
   else()  # H100, DEFAULT
 
@@ -64,25 +95,36 @@ if(CUDAToolkit_FOUND)
     endif()
 
     if (CMAKE_CUDA_COMPILER_VERSION VERSION_LESS 11)
-      message(FATAL_ERROR "CUDA verions must be >= 11 but is ${CMAKE_CUDA_COMPILER_VERSION}.")
+      message(FATAL_ERROR "CUDA verions must be >= 11 but is "
+                          "${CMAKE_CUDA_COMPILER_VERSION}.")
     endif()
     if (CMAKE_CUDA_COMPILER_VERSION VERSION_LESS 12)
-      # 37, 50 still work in CUDA 11 but are marked deprecated and will be removed in future CUDA version.
-      set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_37,code=sm_37") # K80
-      set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_50,code=sm_50") # M series
+      # 37, 50 still work in CUDA 11
+      # but are marked deprecated and will be removed in future CUDA version.
+      # K80
+      set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_37,code=sm_37")
+      # M series
+      set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_50,code=sm_50")
     endif()
-    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_52,code=sm_52") # M60
-    # set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_60,code=sm_60") # P series
-    # set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_61,code=sm_61") # P series
-    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_70,code=sm_70") # V series
-    # set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_75,code=sm_75") # T series
+    # M60
+    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_52,code=sm_52")
+    # P series
+    # set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_60,code=sm_60")
+    # P series
+    # set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_61,code=sm_61")
+    # V series
+    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_70,code=sm_70")
+    # T series
+    # set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_75,code=sm_75")
     if (CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 11)
-      set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_80,code=sm_80") # A series
-      # set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_86,code=sm_86") # A series
-      # set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_87,code=sm_87") # A series
+      # A series
+      set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_80,code=sm_80")
+      # set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_86,code=sm_86")
+      # set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_87,code=sm_87")
     endif()
     if (CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 11.8)
-      set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_90,code=sm_90") # H series
+      # H series
+      set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_90,code=sm_90")
     endif()
   endif()
 

diff --git a/_cmake/externals/FindMyPython.cmake b/_cmake/externals/FindMyPython.cmake
@@ -71,12 +71,6 @@ else()
   message(STATUS "Use find_package(Python3).")
   set(Python3_EXECUTABLE ${PYTHON_EXECUTABLE})
   if(APPLE)
-    message(STATUS "APPLE: set env var for open mp: CC, CCX, LDFLAGS, CPPFLAGS")
-    set(ENV{CC} "/usr/local/opt/llvm/bin/clang")
-    set(ENV{CXX} "/usr/local/opt/llvm/bin/clang++")
-    set(ENV{LDFLAGS} "-L/usr/local/opt/llvm/lib")
-    set(ENV{CPPFLAGS} "-I/usr/local/opt/llvm/include")
-
     find_package(Python3 ${PYTHON_VERSION} COMPONENTS
                 Interpreter Development.Module
                 REQUIRED)

diff --git a/_cmake/externals/FindOrt.cmake b/_cmake/externals/FindOrt.cmake
@@ -4,6 +4,8 @@
 # downloads onnxruntime as a binary
 # functions ort_add_dependency, ort_add_custom_op
 
+file(WRITE "../_setup_ext.txt" "")
+
 if(NOT ORT_VERSION)
   set(ORT_VERSION 1.15.1)
   set(ORT_VERSION_INT 1150)
@@ -58,14 +60,6 @@ else()
   set(ORT_URL ${ORT_VERSION})
 endif()
 
-if(MSVC)
-  set(DLLEXT "dll")
-elseif(APPLE)
-  set(DLLEXT "dylib")
-else()
-  set(DLLEXT "so")
-endif()
-
 find_library(ONNXRUNTIME onnxruntime HINTS "${ONNXRUNTIME_LIB_DIR}")
-if(ONNXRUNTIME-NOTFOUND)
+if(NOT ONNXRUNTIME)  # find_library stores ONNXRUNTIME-NOTFOUND, which is false
   message(FATAL_ERROR "onnxruntime cannot be found at '${ONNXRUNTIME_LIB_DIR}'")
@@ -96,26 +90,27 @@ endif()
 #
 function(ort_add_dependency name folder_copy)
   get_target_property(target_output_directory ${name} BINARY_DIR)
-  message(STATUS "ort copy ${ORT_LIB_FILES_LENGTH} files from '${ONNXRUNTIME_LIB_DIR}'")
+  message(STATUS "ort: copy-1 ${ORT_LIB_FILES_LENGTH} files from '${ONNXRUNTIME_LIB_DIR}'")
   if(MSVC)
     set(destination_dir ${target_output_directory}/${CMAKE_BUILD_TYPE})
   else()
     set(destination_dir ${target_output_directory})
   endif()
-  message(STATUS "ort copy to '${destination_dir}'")
+  message(STATUS "ort: copy-2 to '${destination_dir}'")
   if(folder_copy)
-    message(STATUS "ort copy to '${folder_copy}'")
+    message(STATUS "ort: copy-3 to '${folder_copy}'")
   endif()
   foreach(file_i ${ORT_LIB_FILES})
     if(NOT EXISTS ${destination_dir}/${file_i})
-      message(STATUS "ort copy '${file_i}' to '${destination_dir}'")
+      message(STATUS "ort: copy-4 '${file_i}' to '${destination_dir}'")
       add_custom_command(
         TARGET ${name} POST_BUILD
         COMMAND ${CMAKE_COMMAND} ARGS -E copy ${file_i} ${destination_dir})
     endif()
     if(folder_copy)
       if(NOT EXISTS ${folder_copy}/${file_i})
-        message(STATUS "ort copy '${file_i}' to '${folder_copy}'")
+        message(STATUS "ort: copy-5 '${file_i}' to '${folder_copy}'")
+        # file(APPEND "../_setup_ext.txt" "copy,${file_i},${folder_copy}\n")
         add_custom_command(
           TARGET ${name} POST_BUILD
           COMMAND ${CMAKE_COMMAND} ARGS -E copy ${file_i} ${folder_copy})
@@ -125,8 +120,6 @@ function(ort_add_dependency name folder_copy)
   # file(COPY ${ORT_LIB_FILES} DESTINATION ${target_output_directory})
 endfunction()
 
-file(WRITE "../_setup_ext.txt" "")
-
 #
 #! ort_add_custom_op : compile a pyx file into cpp
 #
@@ -136,8 +129,13 @@ file(WRITE "../_setup_ext.txt" "")
 # \argn: C++ file to compile
 #
 function(ort_add_custom_op name provider folder)
+  if (WIN32)
+    file(WRITE "${folder}/${name}.def" "LIBRARY "
+               "\"${name}.dll\"\nEXPORTS\n  RegisterCustomOps @1")
+    list(APPEND ARGN "${folder}/${name}.def")
+  endif()
   if (provider STREQUAL "CUDA")
-    message(STATUS "ort custom op ${provider}: '${name}': ${ARGN}")
+    message(STATUS "ort: custom op ${provider}: '${name}': ${ARGN}")
     add_library(${name} SHARED ${ARGN})
 
     # add property --use_fast_math to cu files
@@ -173,7 +171,7 @@ function(ort_add_custom_op name provider folder)
       PRIVATE
       ${ONNXRUNTIME_INCLUDE_DIR})
   else()
-    message(STATUS "ort custom op CPU: '${name}': ${ARGN}")
+    message(STATUS "ort: custom op CPU: '${name}': ${ARGN}")
     add_library(${name} SHARED ${ARGN})
     target_include_directories(${name} PRIVATE ${ONNXRUNTIME_INCLUDE_DIR})
     target_compile_definitions(${name} PRIVATE ORT_VERSION=${ORT_VERSION_INT})

diff --git a/_cmake/load_externals.cmake b/_cmake/load_externals.cmake
@@ -158,7 +158,8 @@ message(STATUS "-------------------")
 if(CUDA_AVAILABLE)
-  set(
-    config_content
-    "HAS_CUDA = 1\nCUDA_VERSION = '${CUDA_VERSION}'\nCUDA_VERSION_INT = ${CUDA_VERSION_INT}")
+  # string(CONCAT) keeps one string; set() with two values would build a list.
+  string(CONCAT config_content
+    "HAS_CUDA = 1\nCUDA_VERSION = '${CUDA_VERSION}'"
+    "\nCUDA_VERSION_INT = ${CUDA_VERSION_INT}")
 else()
   set(config_content "HAS_CUDA = 0")
 endif()
diff --git a/_cmake/targets/c_op_conv_.cmake b/_cmake/targets/c_op_conv_.cmake
@@ -9,9 +9,21 @@ local_pybind11_add_module(
   ../onnx_extended/reference/c_ops/cpu/c_op_conv_.cpp)
 eigen_add_dependency(c_op_conv_)
 
+target_include_directories(
+  c_op_conv_
+  PRIVATE
+  ${ROOT_INCLUDE_PATH}/onnx_extended)
+
 add_executable(test_c_op_conv_cpp
                ../_unittests/ut_reference/test_c_op_conv.cpp
                ../onnx_extended/reference/c_ops/cpu/c_op_common.cpp)
-target_include_directories(test_c_op_conv_cpp PRIVATE ${ROOT_INCLUDE_PATH})
+
+target_include_directories(
+  test_c_op_conv_cpp
+  PRIVATE
+  ${ROOT_INCLUDE_PATH}
+  ${ROOT_INCLUDE_PATH}/onnx_extended)
+
 eigen_add_dependency(test_c_op_conv_cpp)
+
 add_test(NAME test_c_op_conv_cpp COMMAND test_c_op_conv_cpp)
diff --git a/_cmake/targets/c_op_tree_ensemble_py_.cmake b/_cmake/targets/c_op_tree_ensemble_py_.cmake
@@ -8,3 +8,7 @@ local_pybind11_add_module(
   ../onnx_extended/reference/c_ops/cpu/c_op_common.cpp
   ../onnx_extended/reference/c_ops/cpu/c_op_tree_ensemble_py_.cpp)
 
+target_include_directories(
+  c_op_tree_ensemble_py_
+  PRIVATE
+  ${ROOT_INCLUDE_PATH}/onnx_extended)