diff --git a/.gitignore b/.gitignore index 794fba36..28358319 100644 --- a/.gitignore +++ b/.gitignore @@ -9,6 +9,7 @@ *.sln *.cmake *.whl +*.def /*.png /*.onnx .build_path.txt diff --git a/CHANGELOGS.rst b/CHANGELOGS.rst index a304f4c3..fbec9801 100644 --- a/CHANGELOGS.rst +++ b/CHANGELOGS.rst @@ -4,6 +4,7 @@ Change Logs 0.2.0 +++++ +* :pr:`41`: implements a custom kernel for RandomForestRegressor easier to optimize * :pr:`34`: update to onnxruntime v1.15.1 * :pr:`31`: implement a custom CUDA kernel (gemm) * :pr:`32`: update to onnxruntime v1.15.0 diff --git a/README.rst b/README.rst index f26f13b5..23de400d 100644 --- a/README.rst +++ b/README.rst @@ -29,7 +29,7 @@ onnx-extended: extensions for onnx and onnxruntime **onnx-extended** extends the list of supported operators in onnx reference implementation, or implements faster versions in C++. Documentation `onnx-extended -`_. +`_. Source are available on `github/onnx-extended `_. @@ -115,9 +115,9 @@ can be enabled with the following command: :: - python setup.py build_ext --inplace --enable_nvtx 1 - # or - pip install -e . --config-settings="--enable_nvtx=1" + python setup.py build_ext --inplace --use_nvtx 1 + # or (not working yet) + pip install -e . 
--config-settings="--use_nvtx=1" Experimental cython binding for onnxruntime +++++++++++++++++++++++++++++++++++++++++++ diff --git a/_cmake/CMakeLists.txt b/_cmake/CMakeLists.txt index 6476872a..0f20b76f 100644 --- a/_cmake/CMakeLists.txt +++ b/_cmake/CMakeLists.txt @@ -1,11 +1,12 @@ cmake_minimum_required(VERSION 3.24.0) -project(onnx_extended VERSION 0.2.0) +project(onnx_extended VERSION ${ONNX_EXTENDED_VERSION}) # # initialisation # message(STATUS "-------------------") +message(STATUS "ONNX_EXTENDED_VERSION=${ONNX_EXTENDED_VERSION}") message(STATUS "CMAKE_VERSION=${CMAKE_VERSION}") message(STATUS "CMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}") message(STATUS "CMAKE_C_COMPILER_VERSION=${CMAKE_C_COMPILER_VERSION}") @@ -25,6 +26,7 @@ message(STATUS "USE_CUDA=${USE_CUDA}") message(STATUS "CUDA_BUILD=${CUDA_BUILD}") message(STATUS "USE_NVTX=${USE_NVTX}") message(STATUS "ORT_VERSION=${ORT_VERSION}") + # message(STATUS "ENV-PATH=$ENV{PATH}") # message(STATUS "ENV-PYTHONPATH=$ENV{PYTHONPATH}") message(STATUS "--------------------------------------------") @@ -44,8 +46,8 @@ list(APPEND CMAKE_MODULE_PATH # Packages and constants # -include("load_externals.cmake") include("constants.cmake") +include("load_externals.cmake") # # modules @@ -61,8 +63,10 @@ include("targets/cuda_example_py.cmake") include("targets/vector_function_cy.cmake") set(ORTOPS_INCLUDE_DIR "${ROOT_INCLUDE_PATH}/onnx_extended/ortops") +set(REFOPS_INCLUDE_DIR "${ROOT_INCLUDE_PATH}/onnx_extended/reference/c_ops") include("targets/ortops_tutorial_cpu.cmake") include("targets/ortops_tutorial_cuda.cmake") +include("targets/ortops_optim_cpu.cmake") # # write version diff --git a/_cmake/clang_format.sh b/_cmake/clang_format.sh index be13c1a7..4938cf4a 100644 --- a/_cmake/clang_format.sh +++ b/_cmake/clang_format.sh @@ -6,11 +6,11 @@ echo "--cython-lint--" cython-lint . 
echo "--clang-format--" find onnx_extended -type f \( -name "*.h" -o -name "*.hpp" -o -name "*.cuh" -o -name "*.cpp" -o -name "*.cc" -o -name "*.cu" \) | while read f; do - echo "Processing '$f'"; - clang-format --length 88 -i $f; + echo "clang-format -i $f"; + clang-format -i $f; done echo "--cmake-lint--" find _cmake -type f \( -name "*.cmake" -o -name "*.txt" \) | while read f; do - echo "Processing '$f'"; + echo "cmake-lint $f --line-width=88 --disabled-codes C0103 C0113"; cmake-lint $f --line-width=88 --disabled-codes C0103 C0113; done diff --git a/_cmake/constants.cmake b/_cmake/constants.cmake index 0c784d0a..36775dfc 100644 --- a/_cmake/constants.cmake +++ b/_cmake/constants.cmake @@ -1,3 +1,14 @@ +# +# python extension +# +if(MSVC) + set(DLLEXT "dll") +elseif(APPLE) + set(DLLEXT "dylib") +else() + set(DLLEXT "so") +endif() + # # C++ 14 or C++ 17 # @@ -37,6 +48,9 @@ else() endif() if(APPLE) + message(STATUS "APPLE: set env var for open mp: CC, CCX, LDFLAGS, CPPFLAGS") + set(ENV{CC} "/usr/local/opt/llvm/bin/clang") + set(ENV{CXX} "/usr/local/opt/llvm/bin/clang++") set(ENV(LDFLAGS) "-L/usr/local/opt/llvm/lib") set(ENV(CPPFLAGS) "-I/usr/local/opt/llvm/include") endif() diff --git a/_cmake/externals/FindCudaExtension.cmake b/_cmake/externals/FindCudaExtension.cmake index 86778062..eca09065 100644 --- a/_cmake/externals/FindCudaExtension.cmake +++ b/_cmake/externals/FindCudaExtension.cmake @@ -4,6 +4,14 @@ # Defines USE_NTVX to enable profiling with NVIDIA profiler. # CUDA_VERSION must be defined as well. 
+if(${CMAKE_CUDA_COMPILER} STREQUAL "/usr/bin/nvcc") + message(FATAL_ERROR + "CMAKE_CUDA_COMPILER is equal to '${CMAKE_CUDA_COMPILER}', " + "CUDA_VERSION=${CUDA_VERSION}, " + "CMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES}, " + "You should specify the cuda version by adding --cuda-version=...") +endif() + if(CUDA_VERSION) find_package(CUDAToolkit ${CUDA_VERSION} EXACT) else() @@ -14,6 +22,27 @@ message(STATUS "CUDAToolkit_FOUND=${CUDAToolkit_FOUND}") if(CUDAToolkit_FOUND) + message(STATUS "befor1 language CUDA_VERSION=${CUDA_VERSION}") + message(STATUS "befor1 language CMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES}") + message(STATUS "befor1 language CMAKE_CUDA_COMPILER=${CMAKE_CUDA_COMPILER}") + + if(CMAKE_CUDA_ARCHITECTURES STREQUAL "") + set(CMAKE_CUDA_ARCHITECTURES "native") + endif() + if(CMAKE_CUDA_COMPILER STREQUAL "CMAKE_CUDA_COMPILER-NOTFOUND") + if(CUDA_VERSION STREQUAL "") + message(FATAL_ERROR "No CMAKE_CUDA_COMPILER for CUDA_VERSION=${CUDA_VERSION}. " + "You can use --cuda-version= or set " + "CUDACXX=/usr/local/cuda-/bin/nvcc") + else() + set(CMAKE_CUDA_COMPILER "/usr/local/cuda-${CUDA_VERSION}/bin/nvcc") + message(STATUS "set CMAKE_CUDA_COMPILER=${CMAKE_CUDA_COMPILER}") + endif() + endif() + + message(STATUS "before language CUDA_VERSION=${CUDA_VERSION}") + message(STATUS "before language CMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES}") + message(STATUS "before language CMAKE_CUDA_COMPILER=${CMAKE_CUDA_COMPILER}") enable_language(CUDA) message(STATUS "------------- CUDA settings") message(STATUS "CUDA_VERSION=${CUDA_VERSION}") @@ -30,7 +59,7 @@ if(CUDAToolkit_FOUND) "< ${CUDA_VERSION}, nvcc is not setup properly. 
" "Try 'whereis nvcc' and chack the version.") endif() - + set(CMAKE_CUDA_STANDARD 17) set(CMAKE_CUDA_STANDARD_REQUIRED ON) @@ -42,11 +71,13 @@ if(CUDAToolkit_FOUND) if(CUDA_BUILD STREQUAL "H100opt") - # see https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ + # see https://arnon.dk/ + # matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ set(CMAKE_CUDA_ARCHITECTURES 90) set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_90,code=sm_90") set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_90a,code=sm_90a") - set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_90a,code=compute_90a") + set(CMAKE_CUDA_FLAGS + "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_90a,code=compute_90a") else() # H100, DEFAULT @@ -64,25 +95,36 @@ if(CUDAToolkit_FOUND) endif() if (CMAKE_CUDA_COMPILER_VERSION VERSION_LESS 11) - message(FATAL_ERROR "CUDA verions must be >= 11 but is ${CMAKE_CUDA_COMPILER_VERSION}.") + message(FATAL_ERROR "CUDA verions must be >= 11 but is " + "${CMAKE_CUDA_COMPILER_VERSION}.") endif() if (CMAKE_CUDA_COMPILER_VERSION VERSION_LESS 12) - # 37, 50 still work in CUDA 11 but are marked deprecated and will be removed in future CUDA version. - set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_37,code=sm_37") # K80 - set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_50,code=sm_50") # M series + # 37, 50 still work in CUDA 11 + # but are marked deprecated and will be removed in future CUDA version. 
+ # K80 + set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_37,code=sm_37") + # M series + set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_50,code=sm_50") endif() - set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_52,code=sm_52") # M60 - # set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_60,code=sm_60") # P series - # set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_61,code=sm_61") # P series - set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_70,code=sm_70") # V series - # set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_75,code=sm_75") # T series + # M60 + set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_52,code=sm_52") + # P series + # set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_60,code=sm_60") + # P series + # set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_61,code=sm_61") + # V series + set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_70,code=sm_70") + # T series + # set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_75,code=sm_75") if (CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 11) - set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_80,code=sm_80") # A series - # set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_86,code=sm_86") # A series - # set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_87,code=sm_87") # A series + # A series + set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_80,code=sm_80") + # set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_86,code=sm_86") + # set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_87,code=sm_87") endif() if (CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 11.8) - set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_90,code=sm_90") # H series + # H series + set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_90,code=sm_90") 
endif() endif() diff --git a/_cmake/externals/FindMyPython.cmake b/_cmake/externals/FindMyPython.cmake index d371dfc4..0bc26951 100644 --- a/_cmake/externals/FindMyPython.cmake +++ b/_cmake/externals/FindMyPython.cmake @@ -71,12 +71,6 @@ else() message(STATUS "Use find_package(Python3).") set(Python3_EXECUTABLE ${PYTHON_EXECUTABLE}) if(APPLE) - message(STATUS "APPLE: set env var for open mp: CC, CCX, LDFLAGS, CPPFLAGS") - set(ENV{CC} "/usr/local/opt/llvm/bin/clang") - set(ENV{CXX} "/usr/local/opt/llvm/bin/clang++") - set(ENV{LDFLAGS} "-L/usr/local/opt/llvm/lib") - set(ENV{CPPFLAGS} "-I/usr/local/opt/llvm/include") - find_package(Python3 ${PYTHON_VERSION} COMPONENTS Interpreter Development.Module REQUIRED) diff --git a/_cmake/externals/FindOrt.cmake b/_cmake/externals/FindOrt.cmake index 56c56ed0..733dff0b 100644 --- a/_cmake/externals/FindOrt.cmake +++ b/_cmake/externals/FindOrt.cmake @@ -4,6 +4,8 @@ # downloads onnxruntime as a binary # functions ort_add_dependency, ort_add_custom_op +file(WRITE "../_setup_ext.txt" "") + if(NOT ORT_VERSION) set(ORT_VERSION 1.15.1) set(ORT_VERSION_INT 1150) @@ -58,14 +60,6 @@ else() set(ORT_URL ${ORT_VERSION}) endif() -if(MSVC) - set(DLLEXT "dll") -elseif(APPLE) - set(DLLEXT "dylib") -else() - set(DLLEXT "so") -endif() - find_library(ONNXRUNTIME onnxruntime HINTS "${ONNXRUNTIME_LIB_DIR}") if(ONNXRUNTIME-NOTFOUND) message(FATAL_ERROR "onnxruntime cannot be found at '${ONNXRUNTIME_LIB_DIR}'") @@ -96,26 +90,27 @@ endif() # function(ort_add_dependency name folder_copy) get_target_property(target_output_directory ${name} BINARY_DIR) - message(STATUS "ort copy ${ORT_LIB_FILES_LENGTH} files from '${ONNXRUNTIME_LIB_DIR}'") + message(STATUS "ort: copy-1 ${ORT_LIB_FILES_LENGTH} files from '${ONNXRUNTIME_LIB_DIR}'") if(MSVC) set(destination_dir ${target_output_directory}/${CMAKE_BUILD_TYPE}) else() set(destination_dir ${target_output_directory}) endif() - message(STATUS "ort copy to '${destination_dir}'") + message(STATUS "ort: copy-2 to 
'${destination_dir}'") if(folder_copy) - message(STATUS "ort copy to '${folder_copy}'") + message(STATUS "ort: copy-3 to '${folder_copy}'") endif() foreach(file_i ${ORT_LIB_FILES}) if(NOT EXISTS ${destination_dir}/${file_i}) - message(STATUS "ort copy '${file_i}' to '${destination_dir}'") + message(STATUS "ort: copy-4 '${file_i}' to '${destination_dir}'") add_custom_command( TARGET ${name} POST_BUILD COMMAND ${CMAKE_COMMAND} ARGS -E copy ${file_i} ${destination_dir}) endif() if(folder_copy) if(NOT EXISTS ${folder_copy}/${file_i}) - message(STATUS "ort copy '${file_i}' to '${folder_copy}'") + message(STATUS "ort: copy-5 '${file_i}' to '${folder_copy}'") + # file(APPEND "../_setup_ext.txt" "copy,${file_i},${folder_copy}\n") add_custom_command( TARGET ${name} POST_BUILD COMMAND ${CMAKE_COMMAND} ARGS -E copy ${file_i} ${folder_copy}) @@ -125,8 +120,6 @@ function(ort_add_dependency name folder_copy) # file(COPY ${ORT_LIB_FILES} DESTINATION ${target_output_directory}) endfunction() -file(WRITE "../_setup_ext.txt" "") - # #! 
ort_add_custom_op : compile a pyx file into cpp # @@ -136,8 +129,13 @@ file(WRITE "../_setup_ext.txt" "") # \argn: C++ file to compile # function(ort_add_custom_op name provider folder) + if (WIN32) + file(WRITE "${folder}/${name}.def" "LIBRARY " + "\"${name}.dll\"\nEXPORTS\n RegisterCustomOps @1") + list(APPEND ARGN "${folder}/${name}.def") + endif() if (provider STREQUAL "CUDA") - message(STATUS "ort custom op ${provider}: '${name}': ${ARGN}") + message(STATUS "ort: custom op ${provider}: '${name}': ${ARGN}") add_library(${name} SHARED ${ARGN}) # add property --use_fast_math to cu files @@ -173,7 +171,7 @@ function(ort_add_custom_op name provider folder) PRIVATE ${ONNXRUNTIME_INCLUDE_DIR}) else() - message(STATUS "ort custom op CPU: '${name}': ${ARGN}") + message(STATUS "ort: custom op CPU: '${name}': ${ARGN}") add_library(${name} SHARED ${ARGN}) target_include_directories(${name} PRIVATE ${ONNXRUNTIME_INCLUDE_DIR}) target_compile_definitions(${name} PRIVATE ORT_VERSION=${ORT_VERSION_INT}) diff --git a/_cmake/load_externals.cmake b/_cmake/load_externals.cmake index 71f65954..ec6a3635 100644 --- a/_cmake/load_externals.cmake +++ b/_cmake/load_externals.cmake @@ -158,7 +158,8 @@ message(STATUS "-------------------") if(CUDA_AVAILABLE) set( config_content - "HAS_CUDA = 1\nCUDA_VERSION = '${CUDA_VERSION}'\nCUDA_VERSION_INT = ${CUDA_VERSION_INT}") + "HAS_CUDA = 1\nCUDA_VERSION = '${CUDA_VERSION}'" + "\nCUDA_VERSION_INT = ${CUDA_VERSION_INT}") else() set(config_content "HAS_CUDA = 0") endif() diff --git a/_cmake/targets/c_op_conv_.cmake b/_cmake/targets/c_op_conv_.cmake index ad334831..10c7717c 100644 --- a/_cmake/targets/c_op_conv_.cmake +++ b/_cmake/targets/c_op_conv_.cmake @@ -9,9 +9,21 @@ local_pybind11_add_module( ../onnx_extended/reference/c_ops/cpu/c_op_conv_.cpp) eigen_add_dependency(c_op_conv_) +target_include_directories( + c_op_conv_ + PRIVATE + ${ROOT_INCLUDE_PATH}/onnx_extended) + add_executable(test_c_op_conv_cpp 
../_unittests/ut_reference/test_c_op_conv.cpp ../onnx_extended/reference/c_ops/cpu/c_op_common.cpp) -target_include_directories(test_c_op_conv_cpp PRIVATE ${ROOT_INCLUDE_PATH}) + +target_include_directories( + test_c_op_conv_cpp + PRIVATE + ${ROOT_INCLUDE_PATH} + ${ROOT_INCLUDE_PATH}/onnx_extended) + eigen_add_dependency(test_c_op_conv_cpp) + add_test(NAME test_c_op_conv_cpp COMMAND test_c_op_conv_cpp) diff --git a/_cmake/targets/c_op_tree_ensemble_py_.cmake b/_cmake/targets/c_op_tree_ensemble_py_.cmake index b8d66c6c..3356f9f1 100644 --- a/_cmake/targets/c_op_tree_ensemble_py_.cmake +++ b/_cmake/targets/c_op_tree_ensemble_py_.cmake @@ -8,3 +8,7 @@ local_pybind11_add_module( ../onnx_extended/reference/c_ops/cpu/c_op_common.cpp ../onnx_extended/reference/c_ops/cpu/c_op_tree_ensemble_py_.cpp) +target_include_directories( + c_op_tree_ensemble_py_ + PRIVATE + ${ROOT_INCLUDE_PATH}/onnx_extended) diff --git a/_cmake/targets/ortinf.cmake b/_cmake/targets/ortinf.cmake index e189f7a1..619f136c 100644 --- a/_cmake/targets/ortinf.cmake +++ b/_cmake/targets/ortinf.cmake @@ -4,7 +4,10 @@ message(STATUS "+ CYTHON onnx_extended.ortcy.wrap.ortapi") add_library(lib_ortapi STATIC ../onnx_extended/ortcy/wrap/ortapi.cpp) -target_include_directories(lib_ortapi PUBLIC ${ONNXRUNTIME_INCLUDE_DIR}) +target_include_directories( + lib_ortapi PUBLIC + ${ONNXRUNTIME_INCLUDE_DIR} + ${ROOT_INCLUDE_PATH}/onnx_extended) cython_add_module( ortinf @@ -13,14 +16,17 @@ cython_add_module( target_link_directories(ortinf PRIVATE ${ONNXRUNTIME_LIB_DIR}) message(STATUS " LINK ortinf <- lib_ortapi onnxruntime") target_link_libraries(ortinf PRIVATE lib_ortapi onnxruntime) +target_include_directories(ortinf PRIVATE ${ROOT_INCLUDE_PATH}/onnx_extended) ort_add_dependency(ortinf ${CMAKE_CURRENT_SOURCE_DIR}/../onnx_extended/ortcy/wrap/) set(ORTAPI_INCLUDE_DIR "${ROOT_INCLUDE_PATH}/onnx_extended/ortcy/wrap") add_executable(test_ortcy_inference_cpp ../_unittests/ut_ortcy/test_inference.cpp) 
target_include_directories( - test_ortcy_inference_cpp PRIVATE + test_ortcy_inference_cpp + PRIVATE ${ROOT_INCLUDE_PATH} + ${ROOT_INCLUDE_PATH}/onnx_extended ${ORT_DIR}/include) message(STATUS " LINK test_ortcy_inference_cpp <- lib_ortapi onnxruntime") target_link_directories(test_ortcy_inference_cpp PRIVATE ${ONNXRUNTIME_LIB_DIR}) diff --git a/_cmake/targets/ortops_optim_cpu.cmake b/_cmake/targets/ortops_optim_cpu.cmake new file mode 100644 index 00000000..184b4146 --- /dev/null +++ b/_cmake/targets/ortops_optim_cpu.cmake @@ -0,0 +1,26 @@ +# +# module: onnx_extended.reference.c_ops.cpu.c_op_conv_ +# +message(STATUS "+ KERNEL onnx_extended.ortops.optim.cpu") + +ort_add_custom_op( + ortops_optim_cpu + "CPU" + ../onnx_extended/ortops/optim/cpu + ../onnx_extended/ortops/optim/cpu/tree_ensemble.cc + ../onnx_extended/ortops/optim/cpu/ort_optim_cpu_lib.cc + ../onnx_extended/reference/c_ops/cpu/c_op_common.cpp) + +target_include_directories( + ortops_optim_cpu + PRIVATE + ${ROOT_INCLUDE_PATH}/onnx_extended) + +target_include_directories( + ortops_optim_cpu + PRIVATE + "${ORTAPI_INCLUDE_DIR}" + "${ORTOPS_INCLUDE_DIR}" + "${REFOPS_INCLUDE_DIR}") + +target_link_libraries(ortops_optim_cpu PRIVATE OpenMP::OpenMP_CXX) diff --git a/_cmake/targets/ortops_tutorial_cpu.cmake b/_cmake/targets/ortops_tutorial_cpu.cmake index 36242cf8..57d5a4b8 100644 --- a/_cmake/targets/ortops_tutorial_cpu.cmake +++ b/_cmake/targets/ortops_tutorial_cpu.cmake @@ -10,9 +10,11 @@ ort_add_custom_op( ../onnx_extended/ortops/tutorial/cpu/my_kernel.cc ../onnx_extended/ortops/tutorial/cpu/my_kernel_attr.cc ../onnx_extended/ortops/tutorial/cpu/ort_tutorial_cpu_lib.cc) -# needed to include helpers.h + +# needed to include onnx_extended_helpers.h target_include_directories( ortops_tutorial_cpu PRIVATE + "${ROOT_INCLUDE_PATH}/onnx_extended" "${ORTAPI_INCLUDE_DIR}" "${ORTOPS_INCLUDE_DIR}") diff --git a/_cmake/targets/ortops_tutorial_cuda.cmake b/_cmake/targets/ortops_tutorial_cuda.cmake index 4d45d7d2..8c4f9794 
100644 --- a/_cmake/targets/ortops_tutorial_cuda.cmake +++ b/_cmake/targets/ortops_tutorial_cuda.cmake @@ -12,10 +12,13 @@ if(CUDA_AVAILABLE) ../onnx_extended/ortops/tutorial/cuda ../onnx_extended/ortops/tutorial/cuda/custom_gemm.cu ../onnx_extended/ortops/tutorial/cuda/ort_tutorial_cuda_lib.cc) - # needed to include helpers.h + + # needed to include onnx_extended_helpers.h target_include_directories( ortops_tutorial_cuda PRIVATE + "${ROOT_INCLUDE_PATH}/onnx_extended" "${ORTAPI_INCLUDE_DIR}" "${ORTOPS_INCLUDE_DIR}") + endif() diff --git a/_doc/api/ortops.rst b/_doc/api/ortops.rst index 764710c5..b4ea9007 100644 --- a/_doc/api/ortops.rst +++ b/_doc/api/ortops.rst @@ -3,20 +3,8 @@ ortops ====== -get_ort_ext_libs -================ +.. toctree:: + :maxdepth: -.. autofunction:: onnx_extended.ortops.tutorial.cpu.get_ort_ext_libs - -List of implemented kernels -=========================== - -onnx_extented.ortops.tutorial.cpu -+++++++++++++++++++++++++++++++++ - -.. runpython:: - :showcode: - :rst: - - from onnx_extended.ortops.tutorial.cpu import documentation - print("\n".join(documentation())) + ortops_tutorial + ortops_optim diff --git a/_doc/api/ortops_optim.rst b/_doc/api/ortops_optim.rst new file mode 100644 index 00000000..c321d0c7 --- /dev/null +++ b/_doc/api/ortops_optim.rst @@ -0,0 +1,23 @@ + +============ +ortops.optim +============ + +optimize +======== + +.. autofunction:: onnx_extended.ortops.optim.optimize.change_onnx_operator_domain + +CPU: onnx_extented.ortops.optim.cpu +=================================== + +.. autofunction:: onnx_extended.ortops.optim.cpu.get_ort_ext_libs + +**List of implemented kernels** + +.. 
runpython:: + :showcode: + :rst: + + from onnx_extended.ortops.optim.cpu import documentation + print("\n".join(documentation())) diff --git a/_doc/api/ortops_tutorial.rst b/_doc/api/ortops_tutorial.rst new file mode 100644 index 00000000..6ee12e9d --- /dev/null +++ b/_doc/api/ortops_tutorial.rst @@ -0,0 +1,32 @@ + +=============== +ortops.tutorial +=============== + +CPU: onnx_extented.ortops.tutorial.cpu +====================================== + +.. autofunction:: onnx_extended.ortops.tutorial.cpu.get_ort_ext_libs + +**List of implemented kernels** + +.. runpython:: + :showcode: + :rst: + + from onnx_extended.ortops.tutorial.cpu import documentation + print("\n".join(documentation())) + +CUDA: onnx_extented.ortops.tutorial.cuda +======================================== + +.. autofunction:: onnx_extended.ortops.tutorial.cuda.get_ort_ext_libs + +**List of implemented kernels** + +.. runpython:: + :showcode: + :rst: + + from onnx_extended.ortops.tutorial.cuda import documentation + print("\n".join(documentation())) diff --git a/_doc/conf.py b/_doc/conf.py index 12e6b5cb..947955fc 100644 --- a/_doc/conf.py +++ b/_doc/conf.py @@ -74,7 +74,8 @@ epkg_dictionary = { "cmake": "https://cmake.org/", "CPUExecutionProvider": "https://onnxruntime.ai/docs/execution-providers/", - "cublasLtMatmul": "https://docs.nvidia.com/cuda/cublas/index.html?highlight=cublasltmatmul#cublasltmatmul", + "cublasLtMatmul": "https://docs.nvidia.com/cuda/cublas/index.html?" 
+ "highlight=cublasltmatmul#cublasltmatmul", "CUDA": "https://developer.nvidia.com/", "cudnn": "https://developer.nvidia.com/cudnn", "cython": "https://cython.org/", @@ -97,7 +98,8 @@ ), "onnxruntime C API": "https://onnxruntime.ai/docs/api/c/", "onnxruntime Graph Optimizations": ( - "https://onnxruntime.ai/docs/performance/model-optimizations/graph-optimizations.html" + "https://onnxruntime.ai/docs/performance/" + "model-optimizations/graph-optimizations.html" ), "openmp": "https://www.openmp.org/", "protobuf": "https://github.com/protocolbuffers/protobuf", diff --git a/_doc/examples/plot_bench_ort.py b/_doc/examples/plot_bench_cypy_ort.py similarity index 99% rename from _doc/examples/plot_bench_ort.py rename to _doc/examples/plot_bench_cypy_ort.py index 0bebb160..1fabfb5c 100644 --- a/_doc/examples/plot_bench_ort.py +++ b/_doc/examples/plot_bench_cypy_ort.py @@ -1,6 +1,4 @@ """ -.. _l-example-bench-ort: - Measuring onnxruntime performance ================================= diff --git a/_doc/index.rst b/_doc/index.rst index 79bb2b76..dfd1cd04 100644 --- a/_doc/index.rst +++ b/_doc/index.rst @@ -105,15 +105,15 @@ If not, some extensions might not be available. :: python setup.py build_ext --inplace - # or - pip install -e . --config-settings="--enable_nvtx=1" + # or (not working yet) + pip install -e . --config-settings="--use_nvtx=1" `NVTX `_ can be enabled with the following command: :: - python setup.py build_ext --inplace --enable_nvtx 1 + python setup.py build_ext --inplace --use_nvtx 1 Experimental cython binding for onnxruntime +++++++++++++++++++++++++++++++++++++++++++ diff --git a/_doc/tutorial/index.rst b/_doc/tutorial/index.rst index 7b84ed93..b851d4b2 100644 --- a/_doc/tutorial/index.rst +++ b/_doc/tutorial/index.rst @@ -37,7 +37,7 @@ with two instructions: By default, *cmake* builds with CUDA if it is available. It can be disabled: * ``python setup.py build_ext --inplace --with-cuda=0``, the legacy way -* ``pip install -e . 
--config-settings="--with-cuda=0"``, the new way +* ``pip install -e . --config-settings="--with-cuda=0"``, the new way (not fully working yet) In case there are multiple versions of CUDA installed, option `cuda-version` can be specified: @@ -72,7 +72,7 @@ Validation, Experiments ../auto_examples/plot_bench_cpu_vector_sum_parallel ../auto_examples/plot_bench_cpu_vector_sum_avx_parallel ../auto_examples/plot_bench_gpu_vector_sum_gpu - ../auto_examples/plot_bench_ort + ../auto_examples/plot_bench_cypy_ort ../auto_examples/plot_bench_gemm_f8 ../auto_examples/plot_bench_gemm_ort ../auto_examples/plot_profile_gemm_ort diff --git a/_unittests/ut_ortcy/test_ortcy.py b/_unittests/ut_ortcy/test_ortcy.py index eef1e03d..86dabc50 100644 --- a/_unittests/ut_ortcy/test_ortcy.py +++ b/_unittests/ut_ortcy/test_ortcy.py @@ -1,4 +1,5 @@ import unittest +import warnings import os import numpy from onnx import TensorProto @@ -13,8 +14,26 @@ from onnx.checker import check_model from onnx_extended.ext_test_case import ExtTestCase +try: + from onnx_extended.ortcy.wrap.ortinf import OrtSession +except ImportError as e: + msg = "libonnxruntime.so.1.15.1: cannot open shared object file" + if msg in str(e): + from onnx_extended.ortcy.wrap import __file__ as loc + + all_files = os.listdir(os.path.dirname(loc)) + warnings.warn( + f"Unable to find onnxruntime {e!r}, found files in {os.path.dirname(loc)}: " + f"{all_files}." 
+ ) + OrtSession = None + here = os.path.dirname(__file__) + else: + OrtSession = "OrtSession is not initialized" + class TestOrtCy(ExtTestCase): + @unittest.skipIf(OrtSession is None, reason="libonnxruntime installation failed") def test_ort_get_available_providers(self): from onnx_extended.ortcy.wrap.ortinf import ort_get_available_providers @@ -23,6 +42,7 @@ def test_ort_get_available_providers(self): self.assertGreater(len(res), 0) self.assertIn("CPUExecutionProvider", res) + @unittest.skipIf(OrtSession is None, reason="libonnxruntime installation failed") def test_session(self): from onnx_extended.ortcy.wrap.ortinf import OrtSession @@ -66,6 +86,7 @@ def test_session(self): self.assertEqual(len(got), 1) self.assertEqualArray(got[0], x + y) + @unittest.skipIf(OrtSession is None, reason="libonnxruntime installation failed") def test_my_custom_ops_cy(self): from onnx_extended.ortcy.wrap.ortinf import OrtSession from onnx_extended.ortops.tutorial.cpu import get_ort_ext_libs @@ -95,6 +116,7 @@ def test_my_custom_ops_cy(self): got = session.run_2(x, y)[0] self.assertEqualArray(x + y, got) + @unittest.skipIf(OrtSession is None, reason="libonnxruntime installation failed") def test_my_custom_ops_with_attributes(self): from onnx_extended.ortcy.wrap.ortinf import OrtSession from onnx_extended.ortops.tutorial.cpu import get_ort_ext_libs diff --git a/_unittests/ut_ortops/test_optim_cpu.py b/_unittests/ut_ortops/test_optim_cpu.py new file mode 100644 index 00000000..380496af --- /dev/null +++ b/_unittests/ut_ortops/test_optim_cpu.py @@ -0,0 +1,91 @@ +import unittest +import numpy +from sklearn.datasets import make_regression +from sklearn.ensemble import RandomForestRegressor +from skl2onnx import to_onnx +from onnx_extended.ortops.tutorial.cpu import documentation +from onnx_extended.ortops.optim.optimize import ( + change_onnx_operator_domain, + get_node_attribute, +) +from onnx_extended.reference import CReferenceEvaluator +from onnx_extended.ext_test_case import 
ExtTestCase + +try: + from onnxruntime import InferenceSession, SessionOptions +except ImportError: + SessionOptions, InferenceSession = None, None + + +class TestOrtOpOptimCpu(ExtTestCase): + def test_get_ort_ext_libs(self): + from onnx_extended.ortops.optim.cpu import get_ort_ext_libs + + r = get_ort_ext_libs() + self.assertEqual(len(r), 1) + + def test_documentation(self): + doc = documentation() + self.assertIsInstance(doc, list) + self.assertEqual(len(doc), 2) + for d in doc: + self.assertIn("~~~~", d) + self.assertIsInstance(d, str) + + @unittest.skipIf(InferenceSession is None, "onnxruntime not installed") + def test_random_forest_regressor(self): + from onnx_extended.ortops.optim.cpu import get_ort_ext_libs + + X, y = make_regression(100, 2, n_informative=1, random_state=32) + X = X.astype(numpy.float32) + y = y.astype(numpy.float32) + + rf = RandomForestRegressor(3, max_depth=2, random_state=32) + rf.fit(X[:80], y[:80]) + expected = rf.predict(X[80:]).astype(numpy.float32).reshape((-1, 1)) + onx = to_onnx(rf, X[:1]) + feeds = {"X": X[80:]} + + # check with onnxruntime + sess = InferenceSession( + onx.SerializeToString(), providers=["CPUExecutionProvider"] + ) + got = sess.run(None, feeds)[0] + self.assertEqualArray(expected, got, atol=1e-5) + + # check with CReferenceEvaluator + ref = CReferenceEvaluator(onx) + got = ref.run(None, feeds)[0] + self.assertEqualArray(expected, got.reshape((-1, 1)), atol=1e-5) + + # transformation + att = get_node_attribute(onx.graph.node[0], "nodes_modes") + modes = ",".join(map(lambda s: s.decode("ascii"), att.strings)) + onx2 = change_onnx_operator_domain( + onx, + op_type="TreeEnsembleRegressor", + op_domain="ai.onnx.ml", + new_op_domain="onnx_extented.ortops.optim.cpu", + nodes_modes=modes, + ) + self.assertIn("onnx_extented.ortops.optim.cpu", str(onx2)) + + # check with CReferenceEvaluator + ref = CReferenceEvaluator(onx2) + got = ref.run(None, feeds)[0] + self.assertEqualArray(expected, got.reshape((-1, 1)), atol=1e-5) 
+ + # check with onnxruntime + custom op + r = get_ort_ext_libs() + self.assertExists(r[0]) + opts = SessionOptions() + opts.register_custom_ops_library(r[0]) + sess = InferenceSession( + onx2.SerializeToString(), opts, providers=["CPUExecutionProvider"] + ) + got = sess.run(None, feeds)[0] + self.assertEqualArray(expected, got, atol=1e-5) + + +if __name__ == "__main__": + unittest.main(verbosity=2) diff --git a/_unittests/ut_ortops/test_optim_py.py b/_unittests/ut_ortops/test_optim_py.py new file mode 100644 index 00000000..a7eef2a2 --- /dev/null +++ b/_unittests/ut_ortops/test_optim_py.py @@ -0,0 +1,164 @@ +import unittest +import numpy +from onnx import TensorProto +from onnx.checker import check_model +from onnx.reference import ReferenceEvaluator +from onnx.helper import ( + make_model, + make_node, + make_graph, + make_opsetid, + make_tensor_value_info, +) +from onnx_extended.ext_test_case import ExtTestCase +from onnx_extended.ortops.optim.optimize import change_onnx_operator_domain + + +class TestOrtOpOptimPy(ExtTestCase): + def test_replace_add(self): + X = make_tensor_value_info("X", TensorProto.FLOAT, [None]) + Y = make_tensor_value_info("Y", TensorProto.FLOAT, [None]) + Z = make_tensor_value_info("Z", TensorProto.FLOAT, [None]) + node = make_node("Add", ["X", "Y"], ["Z"]) + graph = make_graph([node], "g", [X, Y], [Z]) + onnx_model = make_model(graph, opset_imports=[make_opsetid("", 16)]) + check_model(onnx_model) + + repl = change_onnx_operator_domain(onnx_model, op_type="Add", new_op_type="Sub") + check_model(repl) + self.assertEqual(len(repl.graph.node), 1) + self.assertEqual(repl.graph.node[0].op_type, "Sub") + ref = ReferenceEvaluator(repl) + x = numpy.arange(5).astype(numpy.float32) + y = (x * 10).astype(numpy.float32) + got = ref.run(None, {"X": x, "Y": y}) + self.assertEqualArray(x - y, got[0]) + + def test_replace_argmin_1(self): + X = make_tensor_value_info("X", TensorProto.FLOAT, [None]) + Z = make_tensor_value_info("Z", TensorProto.FLOAT, 
[None]) + node = make_node("ArgMin", ["X"], ["Z"], axis=0) + graph = make_graph([node], "g", [X], [Z]) + onnx_model = make_model(graph, opset_imports=[make_opsetid("", 16)]) + check_model(onnx_model) + + repl = change_onnx_operator_domain( + onnx_model, op_type="ArgMin", new_op_type="ArgMin", axis=None + ) + check_model(repl) + self.assertEqual(len(repl.graph.node), 1) + self.assertEqual(repl.graph.node[0].op_type, "ArgMin") + self.assertEqual(len(repl.graph.node[0].attribute), 0) + ref = ReferenceEvaluator(repl) + x = numpy.arange(5).astype(numpy.float32) + got = ref.run(None, {"X": x}) + self.assertEqualArray(numpy.argmin(x).reshape((-1,)), got[0]) + + def test_replace_argmin_2(self): + X = make_tensor_value_info("X", TensorProto.FLOAT, [None]) + Z = make_tensor_value_info("Z", TensorProto.FLOAT, [None]) + node = make_node("ArgMin", ["X"], ["Z"]) + graph = make_graph([node], "g", [X], [Z]) + onnx_model = make_model(graph, opset_imports=[make_opsetid("", 16)]) + check_model(onnx_model) + + repl = change_onnx_operator_domain( + onnx_model, op_type="ArgMin", new_op_type="ArgMin", axis=0 + ) + check_model(repl) + self.assertEqual(len(repl.graph.node), 1) + self.assertEqual(repl.graph.node[0].op_type, "ArgMin") + self.assertEqual(len(repl.graph.node[0].attribute), 1) + ref = ReferenceEvaluator(repl) + x = numpy.arange(5).astype(numpy.float32) + got = ref.run(None, {"X": x}) + self.assertEqualArray(numpy.argmin(x).reshape((-1,)), got[0]) + + def test_replace_argmin_3(self): + X = make_tensor_value_info("X", TensorProto.FLOAT, [None, None]) + Z = make_tensor_value_info("Z", TensorProto.FLOAT, [None, None]) + node = make_node("ArgMin", ["X"], ["Z"], axis=1) + graph = make_graph([node], "g", [X], [Z]) + onnx_model = make_model(graph, opset_imports=[make_opsetid("", 16)]) + check_model(onnx_model) + + repl = change_onnx_operator_domain( + onnx_model, op_type="ArgMin", new_op_type="ArgMax", axis=0 + ) + check_model(repl) + self.assertEqual(len(repl.graph.node), 1) + 
self.assertEqual(repl.graph.node[0].op_type, "ArgMax") + self.assertEqual(len(repl.graph.node[0].attribute), 1) + ref = ReferenceEvaluator(repl) + x = numpy.arange(4).astype(numpy.float32).reshape((2, -1)) + got = ref.run(None, {"X": x}) + self.assertEqualArray(numpy.argmax(x, axis=0, keepdims=1), got[0]) + + def test_replace_domain(self): + X = make_tensor_value_info("X", TensorProto.FLOAT, [None]) + Y = make_tensor_value_info("Y", TensorProto.FLOAT, [None]) + Z = make_tensor_value_info("Z", TensorProto.FLOAT, [None]) + node = make_node("Add", ["X", "Y"], ["Z"]) + graph = make_graph([node], "g", [X, Y], [Z]) + onnx_model = make_model(graph, opset_imports=[make_opsetid("", 16)]) + check_model(onnx_model) + + repl = change_onnx_operator_domain( + onnx_model, + op_type="Add", + new_op_type="Sub", + new_op_domain="NEW", + ) + check_model(repl) + self.assertEqual(len(repl.graph.node), 1) + self.assertEqual(repl.graph.node[0].op_type, "Sub") + self.assertIn('domain: "NEW"', str(repl)) + + def test_replace_domain_att(self): + X = make_tensor_value_info("X", TensorProto.FLOAT, [None]) + Y = make_tensor_value_info("Y", TensorProto.FLOAT, [None]) + Z = make_tensor_value_info("Z", TensorProto.FLOAT, [None]) + node = make_node("Add", ["X", "Y"], ["Z"]) + graph = make_graph([node], "g", [X, Y], [Z]) + onnx_model = make_model(graph, opset_imports=[make_opsetid("", 16)]) + check_model(onnx_model) + + repl = change_onnx_operator_domain( + onnx_model, + op_type="Add", + new_op_type="Sub", + new_op_domain="NEW", + ATTR=6, + ) + check_model(repl) + self.assertEqual(len(repl.graph.node), 1) + self.assertEqual(repl.graph.node[0].op_type, "Sub") + self.assertIn('domain: "NEW"', str(repl)) + self.assertIn('name: "ATTR"', str(repl)) + self.assertIn("i: 6", str(repl)) + + def test_replace_domain_att_same(self): + X = make_tensor_value_info("X", TensorProto.FLOAT, [None]) + Y = make_tensor_value_info("Y", TensorProto.FLOAT, [None]) + Z = make_tensor_value_info("Z", TensorProto.FLOAT, 
[None]) + node = make_node("Add", ["X", "Y"], ["Z"]) + graph = make_graph([node], "g", [X, Y], [Z]) + onnx_model = make_model(graph, opset_imports=[make_opsetid("", 16)]) + check_model(onnx_model) + + repl = change_onnx_operator_domain( + onnx_model, + op_type="Add", + new_op_domain="NEW", + ATTR=6, + ) + check_model(repl) + self.assertEqual(len(repl.graph.node), 1) + self.assertEqual(repl.graph.node[0].op_type, "Add") + self.assertIn('domain: "NEW"', str(repl)) + self.assertIn('name: "ATTR"', str(repl)) + self.assertIn("i: 6", str(repl)) + + +if __name__ == "__main__": + unittest.main(verbosity=2) diff --git a/_unittests/ut_ortops/test_tutorial_cuda.py b/_unittests/ut_ortops/test_tutorial_cuda.py index 1df38c72..737c8b4c 100644 --- a/_unittests/ut_ortops/test_tutorial_cuda.py +++ b/_unittests/ut_ortops/test_tutorial_cuda.py @@ -16,27 +16,19 @@ except ImportError: onnx_simple_text_plot = str try: + from onnxruntime import InferenceSession +except ImportError: + InferenceSession = None + ort_version = "0.0" +if InferenceSession is not None: from onnxruntime import ( - InferenceSession, SessionOptions, get_available_providers, __version__ as ort_version, ) from onnxruntime.capi.onnxruntime_pybind11_state import Fail as OrtFail -except ImportError: - ( - SessionOptions, - InferenceSession, - get_available_providers, - ort_version, - OrtFail, - ) = ( - None, - None, - None, - None, - None, - ) + + from onnx_extended.ortops.tutorial.cuda import documentation from onnx_extended.ext_test_case import ExtTestCase from onnx_extended import has_cuda @@ -47,7 +39,21 @@ get_device_prop = None +from onnx_extended.validation.cuda import cuda_version + + +def has_cuda_ort(): + if not has_cuda(): + return False + if InferenceSession is None: + return False + if "CUDAExecutionProvider" not in get_available_providers(): + return False + return True + + class TestOrtOpTutorialCuda(ExtTestCase): + @unittest.skipIf(get_device_prop is None, reason="CUDA not available") def 
test_get_ort_ext_libs(self): from onnx_extended.ortops.tutorial.cuda import get_ort_ext_libs @@ -211,10 +217,9 @@ def check(f): f"\n----\ngot=\n{got[0][:2,:2]}" ) from e - @unittest.skipIf(InferenceSession is None, "onnxruntime not installed") @unittest.skipIf( - "CUDAExecutionProvider" not in get_available_providers(), - reason="CUDA provider not available", + not has_cuda_ort(), + reason="onnxruntime not installed or CUDA provider not available", ) def test_custom_gemm_float32_default(self): self.common_test_custom_gemm( @@ -226,10 +231,9 @@ def test_custom_gemm_float32_default(self): computeType="CUBLAS_COMPUTE_32F_FAST_TF32", ) - @unittest.skipIf(InferenceSession is None, "onnxruntime not installed") @unittest.skipIf( - "CUDAExecutionProvider" not in get_available_providers(), - reason="CUDA provider not available", + not has_cuda_ort(), + reason="onnxruntime not installed or CUDA provider not available", ) def test_custom_gemm_float32_relu(self): self.common_test_custom_gemm( @@ -242,10 +246,9 @@ def test_custom_gemm_float32_relu(self): activation="RELU", ) - @unittest.skipIf(InferenceSession is None, "onnxruntime not installed") @unittest.skipIf( - "CUDAExecutionProvider" not in get_available_providers(), - reason="CUDA provider not available", + not has_cuda_ort(), + reason="onnxruntime not installed or CUDA provider not available", ) def test_custom_gemm_float32_gelu(self): self.common_test_custom_gemm( @@ -258,10 +261,9 @@ def test_custom_gemm_float32_gelu(self): activation="GELU", ) - @unittest.skipIf(InferenceSession is None, "onnxruntime not installed") @unittest.skipIf( - "CUDAExecutionProvider" not in get_available_providers(), - reason="CUDA provider not available", + not has_cuda_ort(), + reason="onnxruntime not installed or CUDA provider not available", ) def test_custom_gemm_float32_col_major_relu(self): self.common_test_custom_gemm( @@ -275,10 +277,9 @@ def test_custom_gemm_float32_col_major_relu(self): rowMajor=0, ) - 
@unittest.skipIf(InferenceSession is None, "onnxruntime not installed") @unittest.skipIf( - "CUDAExecutionProvider" not in get_available_providers(), - reason="CUDA provider not available", + not has_cuda_ort(), + reason="onnxruntime not installed or CUDA provider not available", ) def test_custom_gemm_float32_col_major_gelu(self): self.common_test_custom_gemm( @@ -292,10 +293,9 @@ def test_custom_gemm_float32_col_major_gelu(self): rowMajor=0, ) - @unittest.skipIf(InferenceSession is None, "onnxruntime not installed") @unittest.skipIf( - "CUDAExecutionProvider" not in get_available_providers(), - reason="CUDA provider not available", + not has_cuda_ort(), + reason="onnxruntime not installed or CUDA provider not available", ) def test_custom_gemm_float32_not_square(self): self.common_test_custom_gemm( @@ -308,10 +308,9 @@ def test_custom_gemm_float32_not_square(self): square=False, ) - @unittest.skipIf(InferenceSession is None, "onnxruntime not installed") @unittest.skipIf( - "CUDAExecutionProvider" not in get_available_providers(), - reason="CUDA provider not available", + not has_cuda_ort(), + reason="onnxruntime not installed or CUDA provider not available", ) def test_custom_gemm_float32_col_major(self): self.common_test_custom_gemm( @@ -324,10 +323,9 @@ def test_custom_gemm_float32_col_major(self): rowMajor=0, ) - @unittest.skipIf(InferenceSession is None, "onnxruntime not installed") @unittest.skipIf( - "CUDAExecutionProvider" not in get_available_providers(), - reason="CUDA provider not available", + not has_cuda_ort(), + reason="onnxruntime not installed or CUDA provider not available", ) def test_custom_gemm_float32_col_major_not_square(self): self.common_test_custom_gemm( @@ -341,10 +339,13 @@ def test_custom_gemm_float32_col_major_not_square(self): square=False, ) - @unittest.skipIf(InferenceSession is None, "onnxruntime not installed") @unittest.skipIf( - "CUDAExecutionProvider" not in get_available_providers(), - reason="CUDA provider not available", + 
not has_cuda_ort(), + reason="onnxruntime not installed or CUDA provider not available", + ) + @unittest.skipIf( + Version(cuda_version()) < Version("12.0"), + reason="beta != 0 bugged in CUDA 11.8.", ) def test_custom_gemm_float32_bias(self): self.common_test_custom_gemm( @@ -357,10 +358,13 @@ def test_custom_gemm_float32_bias(self): beta=1.0, ) - @unittest.skipIf(InferenceSession is None, "onnxruntime not installed") @unittest.skipIf( - "CUDAExecutionProvider" not in get_available_providers(), - reason="CUDA provider not available", + not has_cuda_ort(), + reason="onnxruntime not installed or CUDA provider not available", + ) + @unittest.skipIf( + Version(cuda_version()) < Version("12.0"), + reason="beta != 0 bugged in CUDA 11.8.", ) def test_custom_gemm_float32_bias_01(self): self.common_test_custom_gemm( @@ -373,10 +377,13 @@ def test_custom_gemm_float32_bias_01(self): beta=1.0, ) - @unittest.skipIf(InferenceSession is None, "onnxruntime not installed") @unittest.skipIf( - "CUDAExecutionProvider" not in get_available_providers(), - reason="CUDA provider not available", + not has_cuda_ort(), + reason="onnxruntime not installed or CUDA provider not available", + ) + @unittest.skipIf( + Version(cuda_version()) < Version("12.0"), + reason="beta != 0 bugged in CUDA 11.8.", ) def test_custom_gemm_float32_bias_col_major(self): self.common_test_custom_gemm( @@ -390,10 +397,13 @@ def test_custom_gemm_float32_bias_col_major(self): rowMajor=0, ) - @unittest.skipIf(InferenceSession is None, "onnxruntime not installed") @unittest.skipIf( - "CUDAExecutionProvider" not in get_available_providers(), - reason="CUDA provider not available", + not has_cuda_ort(), + reason="onnxruntime not installed or CUDA provider not available", + ) + @unittest.skipIf( + Version(cuda_version()) < Version("12.0"), + reason="beta != 0 bugged in CUDA 11.8.", ) def test_custom_gemm_float32_not_square_bias(self): self.common_test_custom_gemm( @@ -407,10 +417,13 @@ def 
test_custom_gemm_float32_not_square_bias(self): square=False, ) - @unittest.skipIf(InferenceSession is None, "onnxruntime not installed") @unittest.skipIf( - "CUDAExecutionProvider" not in get_available_providers(), - reason="CUDA provider not available", + not has_cuda_ort(), + reason="onnxruntime not installed or CUDA provider not available", + ) + @unittest.skipIf( + Version(cuda_version()) < Version("12.0"), + reason="beta != 0 bugged in CUDA 11.8.", ) def test_custom_gemm_float32_not_square_bias_col_major(self): self.common_test_custom_gemm( @@ -425,10 +438,9 @@ def test_custom_gemm_float32_not_square_bias_col_major(self): rowMajor=0, ) - @unittest.skipIf(InferenceSession is None, "onnxruntime not installed") @unittest.skipIf( - "CUDAExecutionProvider" not in get_available_providers(), - reason="CUDA provider not available", + not has_cuda_ort(), + reason="onnxruntime not installed or CUDA provider not available", ) def test_custom_gemm_float16_default(self): self.common_test_custom_gemm( @@ -440,10 +452,9 @@ def test_custom_gemm_float16_default(self): computeType="CUBLAS_COMPUTE_32F", ) - @unittest.skipIf(InferenceSession is None, "onnxruntime not installed") @unittest.skipIf( - "CUDAExecutionProvider" not in get_available_providers(), - reason="CUDA provider not available", + not has_cuda_ort(), + reason="onnxruntime not installed or CUDA provider not available", ) def test_custom_gemm_float32_row_major(self): self.common_test_custom_gemm( @@ -456,10 +467,9 @@ def test_custom_gemm_float32_row_major(self): rowMajor=1, ) - @unittest.skipIf(InferenceSession is None, "onnxruntime not installed") @unittest.skipIf( - "CUDAExecutionProvider" not in get_available_providers(), - reason="CUDA provider not available", + not has_cuda_ort(), + reason="onnxruntime not installed or CUDA provider not available", ) @unittest.skipIf( Version(ort_version) < Version("1.16"), reason="float8 types not released" @@ -478,10 +488,9 @@ def test_custom_gemm_float8(self): rowMajor=0, ) 
- @unittest.skipIf(InferenceSession is None, "onnxruntime not installed") @unittest.skipIf( - "CUDAExecutionProvider" not in get_available_providers(), - reason="CUDA provider not available", + not has_cuda_ort(), + reason="onnxruntime not installed or CUDA provider not available", ) @unittest.skipIf( Version(ort_version) < Version("1.16"), reason="float8 types not released" diff --git a/_unittests/ut_reference/test_c_tree_ensemble.py b/_unittests/ut_reference/test_c_tree_ensemble.py index 8fb614ea..88cf7b1b 100644 --- a/_unittests/ut_reference/test_c_tree_ensemble.py +++ b/_unittests/ut_reference/test_c_tree_ensemble.py @@ -15,19 +15,16 @@ from lightgbm import LGBMRegressor from onnx_extended.ext_test_case import ExtTestCase, ignore_warnings from onnx_extended.reference import CReferenceEvaluator -from onnx_extended.reference.c_ops.c_op_tree_ensemble_classifier import ( - TreeEnsembleClassifier_1, -) -from onnx_extended.reference.c_ops.c_op_tree_ensemble_regressor import ( - TreeEnsembleRegressor_1, - TreeEnsembleRegressor_3, -) class TestCTreeEnsemble(ExtTestCase): @unittest.skipIf(onnx_opset_version() < 19, reason="ReferenceEvaluator is bugged") @ignore_warnings((FutureWarning, DeprecationWarning)) def test_decision_tree_classifier_bin(self): + from onnx_extended.reference.c_ops.c_op_tree_ensemble_classifier import ( + TreeEnsembleClassifier_1, + ) + iris = load_iris() X, y = iris.data.astype(numpy.float32), iris.target y[y == 2] = 0 @@ -51,6 +48,10 @@ def test_decision_tree_classifier_bin(self): @unittest.skipIf(onnx_opset_version() < 19, reason="ReferenceEvaluator is bugged") @ignore_warnings((FutureWarning, DeprecationWarning)) def test_decision_tree_classifier_multi(self): + from onnx_extended.reference.c_ops.c_op_tree_ensemble_classifier import ( + TreeEnsembleClassifier_1, + ) + iris = load_iris() X, y = iris.data.astype(numpy.float32), iris.target X_train, X_test, y_train, _ = train_test_split(X, y, random_state=11) @@ -73,6 +74,10 @@ def 
test_decision_tree_classifier_multi(self): @unittest.skipIf(onnx_opset_version() < 19, reason="ReferenceEvaluator is bugged") @ignore_warnings((FutureWarning, DeprecationWarning)) def test_decision_tree_classifier_plusten(self): + from onnx_extended.reference.c_ops.c_op_tree_ensemble_classifier import ( + TreeEnsembleClassifier_1, + ) + iris = load_iris() X, y = iris.data.astype(numpy.float32), iris.target y += 10 @@ -95,6 +100,10 @@ def test_decision_tree_classifier_plusten(self): @unittest.skipIf(onnx_opset_version() < 19, reason="ReferenceEvaluator is bugged") @ignore_warnings((FutureWarning, DeprecationWarning)) def test_gradient_boosting_classifier2(self): + from onnx_extended.reference.c_ops.c_op_tree_ensemble_classifier import ( + TreeEnsembleClassifier_1, + ) + iris = load_iris() X, y = iris.data.astype(numpy.float32), iris.target y[y == 2] = 1 @@ -116,6 +125,10 @@ def test_gradient_boosting_classifier2(self): @unittest.skipIf(onnx_opset_version() < 19, reason="ReferenceEvaluator is bugged") @ignore_warnings((FutureWarning, DeprecationWarning)) def test_gradient_boosting_classifier3(self): + from onnx_extended.reference.c_ops.c_op_tree_ensemble_classifier import ( + TreeEnsembleClassifier_1, + ) + iris = load_iris() X, y = iris.data.astype(numpy.float32), iris.target X_train, X_test, y_train, _ = train_test_split(X, y, random_state=11) @@ -139,6 +152,10 @@ def test_gradient_boosting_classifier3(self): onnx_opset_version() < 19, reason="ArrayFeatureExtractor has no implementation" ) def test_decision_tree_classifier_mlabel(self): + from onnx_extended.reference.c_ops.c_op_tree_ensemble_classifier import ( + TreeEnsembleClassifier_1, + ) + iris = load_iris() X, y_ = iris.data.astype(numpy.float32), iris.target y = numpy.zeros((y_.shape[0], 3), dtype=numpy.int64) @@ -166,6 +183,10 @@ def test_decision_tree_classifier_mlabel(self): @unittest.skipIf(onnx_opset_version() < 19, reason="ReferenceEvaluator is bugged") @ignore_warnings((FutureWarning, 
DeprecationWarning)) def test_decision_tree_regressor(self): + from onnx_extended.reference.c_ops.c_op_tree_ensemble_regressor import ( + TreeEnsembleRegressor_1, + ) + iris = load_iris() X, y = iris.data, iris.target X_train, X_test, y_train, _ = train_test_split(X, y, random_state=11) @@ -195,6 +216,10 @@ def test_decision_tree_regressor(self): @ignore_warnings((FutureWarning, DeprecationWarning, UserWarning)) def test_decision_tree_regressor_double(self): + from onnx_extended.reference.c_ops.c_op_tree_ensemble_regressor import ( + TreeEnsembleRegressor_3, + ) + iris = load_iris() X, y = iris.data, iris.target X_train, X_test, y_train, _ = train_test_split(X, y, random_state=11) @@ -379,7 +404,7 @@ def common_test_onnxrt_python_tree_ensemble_runtime_version_cls( oinf = CReferenceEvaluator(model_def) y = oinf.run(None, {"X": X_test.astype(dtype)}) lexp = clr.predict_proba(X_test).astype(numpy.float32) - atol = {numpy.float32: 1e-5, numpy.float64: 1e-1} + atol = {numpy.float32: 1e-5, numpy.float64: 1.01e-1} with self.subTest(dtype=dtype): if single_cls: diff = list(sorted(numpy.abs(lexp.ravel() - y[1]))) diff --git a/_unittests/ut_validation/test_cuda_gemm.py b/_unittests/ut_validation/test_cuda_gemm.py index cdeb52a5..823a92dc 100644 --- a/_unittests/ut_validation/test_cuda_gemm.py +++ b/_unittests/ut_validation/test_cuda_gemm.py @@ -13,6 +13,7 @@ class TestCudaGemm(ExtTestCase): + @unittest.skipIf(get_device_prop is None, reason="CUDA not available") def test_get_device_prop(self): r = get_device_prop() self.assertIsInstance(r, dict) diff --git a/_unittests/ut_validation/test_speed_metrics.py b/_unittests/ut_validation/test_speed_metrics.py index 1a1003ca..27965a2d 100644 --- a/_unittests/ut_validation/test_speed_metrics.py +++ b/_unittests/ut_validation/test_speed_metrics.py @@ -1,17 +1,21 @@ import unittest from onnx_extended.ext_test_case import ExtTestCase -from onnx_extended.validation.cpu._validation import ( - benchmark_cache, - benchmark_cache_tree, -) 
class TestSpeedMetrics(ExtTestCase): def test_benchmark_cache(self): + from onnx_extended.validation.cpu._validation import ( + benchmark_cache, + ) + res = benchmark_cache(1000, False) self.assertGreater(res, 0) def test_benchmark_cache_tree(self): + from onnx_extended.validation.cpu._validation import ( + benchmark_cache_tree, + ) + res = benchmark_cache_tree(1000) self.assertIsInstance(res, list) self.assertEqual(len(res), 1000) diff --git a/_unittests/ut_validation/test_vector_sum.py b/_unittests/ut_validation/test_vector_sum.py index d3196ad0..0c3ce28a 100644 --- a/_unittests/ut_validation/test_vector_sum.py +++ b/_unittests/ut_validation/test_vector_sum.py @@ -1,22 +1,12 @@ import unittest import numpy from onnx_extended.ext_test_case import ExtTestCase -from onnx_extended.validation.cpu._validation import ( - vector_add, - vector_sum, - vector_sum_array, - vector_sum_array_parallel, - vector_sum_array_avx, - vector_sum_array_avx_parallel, -) -from onnx_extended.validation.cython.vector_function_cy import ( - vector_sum_c, - vector_add_c, -) class TestVectorSum(ExtTestCase): def test_vector_sum_c(self): + from onnx_extended.validation.cython.vector_function_cy import vector_sum_c + values = numpy.array([[10, 1, 4, 5, 6, 7]], dtype=numpy.float32) t1 = vector_sum_c(values, True) t2 = vector_sum_c(values, False) @@ -24,6 +14,8 @@ def test_vector_sum_c(self): self.assertEqual(t2, 33) def test_vector_sum(self): + from onnx_extended.validation.cpu._validation import vector_sum + values = [10, 1, 4, 5, 6, 7] t1 = vector_sum(1, values, True) t2 = vector_sum(1, values, False) @@ -36,6 +28,8 @@ def test_vector_sum(self): self.assertEqual(t2, 33) def test_vector_sum_array(self): + from onnx_extended.validation.cpu._validation import vector_sum_array + values = numpy.array([10, 1, 4, 5, 6, 7], dtype=numpy.float32) t1 = vector_sum_array(1, values, True) t2 = vector_sum_array(1, values, False) @@ -48,6 +42,8 @@ def test_vector_sum_array(self): self.assertEqual(t2, 33) def 
test_vector_sum_array_parallel(self): + from onnx_extended.validation.cpu._validation import vector_sum_array_parallel + values = numpy.arange(16 * 16).reshape((-1, 16)).astype(numpy.float32) t = values.sum() t1 = vector_sum_array_parallel(16, values, True) @@ -56,18 +52,26 @@ def test_vector_sum_array_parallel(self): self.assertEqual(t, t2) def test_vector_sum_array_avx(self): + from onnx_extended.validation.cpu._validation import vector_sum_array_avx + values = numpy.arange(16 * 16).reshape((-1, 16)).astype(numpy.float32) t = values.sum() t1 = vector_sum_array_avx(16, values) self.assertEqual(t, t1) def test_vector_sum_array_avx_parallel(self): + from onnx_extended.validation.cpu._validation import ( + vector_sum_array_avx_parallel, + ) + values = numpy.arange(16 * 16).reshape((-1, 16)).astype(numpy.float32) t = values.sum() t1 = vector_sum_array_avx_parallel(16, values) self.assertEqual(t, t1) def test_vector_add_exc(self): + from onnx_extended.validation.cpu._validation import vector_add + # This test checks function vector_add # raises an exception if the dimension do not match. 
v1 = numpy.ones((3, 4), dtype=numpy.float32) @@ -77,6 +81,8 @@ def test_vector_add_exc(self): self.assertRaise(lambda: vector_add(v1, v2), RuntimeError) def test_vector_add(self): + from onnx_extended.validation.cpu._validation import vector_add + v1 = numpy.ones((3, 4), dtype=numpy.float32) v2 = (numpy.ones((3, 4)) * 10).astype(numpy.float32) v3 = vector_add(v1, v2) @@ -84,6 +90,8 @@ def test_vector_add(self): self.assertEqualArray(v1 + v2, v3) def test_vector_add_c(self): + from onnx_extended.validation.cython.vector_function_cy import vector_add_c + t1 = numpy.arange(10).reshape((2, 5)).astype(numpy.float32) t2 = numpy.arange(10).reshape((2, 5)).astype(numpy.float32) res = t1 + t2 diff --git a/_unittests/ut_xrun_doc/test_documentation_examples.py b/_unittests/ut_xrun_doc/test_documentation_examples.py index cb707f38..538a272b 100644 --- a/_unittests/ut_xrun_doc/test_documentation_examples.py +++ b/_unittests/ut_xrun_doc/test_documentation_examples.py @@ -1,4 +1,5 @@ import unittest +import warnings import os import sys import importlib @@ -10,6 +11,23 @@ VERBOSE = 0 ROOT = os.path.realpath(os.path.abspath(os.path.join(onnx_extended_file, "..", ".."))) +try: + from onnx_extended.ortcy.wrap.ortinf import OrtSession +except ImportError as e: + msg = "libonnxruntime.so.1.15.1: cannot open shared object file" + if msg in str(e): + from onnx_extended.ortcy.wrap import __file__ as loc + + all_files = os.listdir(os.path.dirname(loc)) + warnings.warn( + f"Unable to find onnxruntime {e!r}, found files in {os.path.dirname(loc)}: " + f"{all_files}." 
+ ) + OrtSession = None + here = os.path.dirname(__file__) + else: + OrtSession = "OrtSession is not initialized" + def import_source(module_file_path, module_name): if not os.path.exists(module_file_path): @@ -64,6 +82,9 @@ def add_test_methods(cls): fold = os.path.normpath(os.path.join(this, "..", "..", "_doc", "examples")) found = os.listdir(fold) for name in found: + if OrtSession is None and name in {"plot_bench_cypy_ort.py"}: + # The build went wrong. + continue if name.startswith("plot_") and name.endswith(".py"): short_name = os.path.split(os.path.splitext(name)[0])[-1] diff --git a/azure-pipelines.yml b/azure-pipelines.yml index c315f5d0..778fe91c 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -4,8 +4,8 @@ jobs: vmImage: 'ubuntu-latest' strategy: matrix: - Python310-Linux: - python.version: '3.10' + Python311-Linux: + python.version: '3.11' maxParallel: 3 steps: @@ -30,8 +30,8 @@ jobs: black --diff . displayName: 'Black' - script: | - cmake-lint _cmake/Find* --disabled-codes C0103 C0113 --linelength=88 - cmake-lint _cmake/CMake* --disabled-codes C0103 C0113 --linelength=88 + cmake-lint _cmake/Find* --disabled-codes C0103 C0113 --line-width=88 + cmake-lint _cmake/CMake* --disabled-codes C0103 C0113 --line-width=88 displayName: 'cmake-lint' - script: | rstcheck -r ./_doc ./onnx_extended @@ -40,17 +40,25 @@ jobs: cython-lint . displayName: 'cython-lint' - script: | - python -m pip wheel . --wheel-dir dist --config-settings="--with_cuda=0" -v -v -v + export USE_CUDA=0 + python -m pip install -e . -v + displayName: 'pip install -e . -v' + - script: | + python -m pytest _unittests --durations=10 + displayName: 'Runs Unit Tests' + - script: | + # --config-settings does not work yet. + # python -m pip wheel . --config-settings="--use_cuda=0" -v + export USE_CUDA=0 + python -m pip wheel . -v displayName: 'build wheel' - script: | - python -m pip install . 
-v -v -v - displayName: 'install wheel' -# - script: | # It fails due to ModuleNotFoundError: No module named 'onnx_extended.reference.c_ops.cpu.c_op_conv_' -# python -m pytest . --durations=10 -# displayName: 'Runs Unit Tests' + mkdir dist + cp onnx_extended*.whl dist + displayName: 'copy wheel' - task: PublishPipelineArtifact@0 inputs: - artifactName: 'wheel-linux-wheel-$(python.version)' + artifactName: 'wheel-linux-pip-$(python.version)' targetPath: 'dist' - job: 'TestLinux' @@ -58,8 +66,8 @@ jobs: vmImage: 'ubuntu-latest' strategy: matrix: - Python310-Linux: - python.version: '3.10' + Python311-Linux: + python.version: '3.11' maxParallel: 3 steps: @@ -88,8 +96,8 @@ jobs: black --diff . displayName: 'Black' - script: | - cmake-lint _cmake/Find* --disabled-codes C0103 C0113 --linelength=88 - cmake-lint _cmake/CMake* --disabled-codes C0103 C0113 --linelength=88 + cmake-lint _cmake/Find* --disabled-codes C0103 C0113 --line-width=88 + cmake-lint _cmake/CMake* --disabled-codes C0103 C0113 --line-width=88 displayName: 'cmake-lint' - script: | cython-lint . @@ -109,9 +117,7 @@ jobs: displayName: 'Run C++ Unit Tests' - script: | - cd _unittests - python -m pytest . --durations=10 - cd .. + python -m pytest _unittests --durations=10 displayName: 'Runs Unit Tests' - script: | python -u setup.py bdist_wheel @@ -160,9 +166,7 @@ jobs: ctest -C Release --rerun-failed --output-on-failure displayName: 'Runs C++ Unit Tests' - script: | - cd _unittests - python -m pytest . --durations=10 - cd .. 
+ python -m pytest _unittests --durations=10 displayName: 'Runs Unit Tests' - script: | python -u setup.py bdist_wheel @@ -210,6 +214,7 @@ jobs: pip install -r requirements-dev.txt displayName: 'Install Requirements dev' - script: | + gcc --version python -c "import sys;print('PYTHON', sys.executable)" python -c "import sys;print('PYTHON', sys.version_info)" python -c "import numpy;print('numpy', numpy.__version__)" @@ -223,9 +228,7 @@ jobs: displayName: 'build wheel' - script: | source activate myEnvironment - cd _unittests - python -m pytest . --durations=10 - cd .. + python -m pytest _unittests --durations=10 displayName: 'Runs Unit Tests' - script: | python -u setup.py bdist_wheel diff --git a/onnx_extended/ortcy/wrap/helpers.h b/onnx_extended/onnx_extended_helpers.h similarity index 70% rename from onnx_extended/ortcy/wrap/helpers.h rename to onnx_extended/onnx_extended_helpers.h index f382ab5d..4e44971b 100644 --- a/onnx_extended/ortcy/wrap/helpers.h +++ b/onnx_extended/onnx_extended_helpers.h @@ -8,7 +8,23 @@ #include #include -namespace orthelpers { +namespace onnx_extended_helpers { + +inline std::vector SplitString(const std::string &input, + char delimiter) { + std::vector parts; + std::string::size_type start = 0; + std::string::size_type end = input.find(delimiter); + + while (end != std::string::npos) { + parts.push_back(input.substr(start, end - start)); + start = end + 1; + end = input.find(delimiter, start); + } + + parts.push_back(input.substr(start)); + return parts; +} inline void MakeStringInternal(std::ostringstream &ss) noexcept {} @@ -73,18 +89,18 @@ template inline std::string MakeString(const Args &...args) { } #if !defined(_THROW_DEFINED) -#define EXT_THROW(...) throw std::runtime_error(orthelpers::MakeString(__VA_ARGS__)); +#define EXT_THROW(...) \ + throw std::runtime_error(onnx_extended_helpers::MakeString(__VA_ARGS__)); #define _THROW_DEFINED #endif #if !defined(_ENFORCE_DEFINED) #define EXT_ENFORCE(cond, ...) 
\ if (!(cond)) \ - throw std::runtime_error( \ - orthelpers::MakeString("`", #cond, "` failed. ", orthelpers::MakeString(__VA_ARGS__))); + throw std::runtime_error(onnx_extended_helpers::MakeString( \ + "`", #cond, "` failed. ", \ + onnx_extended_helpers::MakeString(__VA_ARGS__))); #define _ENFORCE_DEFINED #endif - - -} // namespace orthelpers +} // namespace onnx_extended_helpers diff --git a/onnx_extended/ortcy/wrap/ortapi.cpp b/onnx_extended/ortcy/wrap/ortapi.cpp index 96a8580e..1273911b 100644 --- a/onnx_extended/ortcy/wrap/ortapi.cpp +++ b/onnx_extended/ortcy/wrap/ortapi.cpp @@ -1,5 +1,5 @@ #include "ortapi.h" -#include "helpers.h" +#include "onnx_extended_helpers.h" #include "ortapi_inline.h" #ifdef _WIN32 #include @@ -11,235 +11,246 @@ namespace ortapi { std::vector get_available_providers() { - int len; - char** providers; - ThrowOnError(GetOrtApi()->GetAvailableProviders(&providers, &len)); - std::vector available_providers(providers, providers + len); - ThrowOnError(GetOrtApi()->ReleaseAvailableProviders(providers, len)); - return available_providers; + int len; + char **providers; + ThrowOnError(GetOrtApi()->GetAvailableProviders(&providers, &len)); + std::vector available_providers(providers, providers + len); + ThrowOnError(GetOrtApi()->ReleaseAvailableProviders(providers, len)); + return available_providers; } void OrtCpuValue::free_ort_value() { - if (ort_value_ != nullptr) { - GetOrtApi()->ReleaseValue((OrtValue*)ort_value_); - ort_value_ = nullptr; - } + if (ort_value_ != nullptr) { + GetOrtApi()->ReleaseValue((OrtValue *)ort_value_); + ort_value_ = nullptr; + } } class OrtInference { public: + OrtInference() { + ThrowOnError( + GetOrtApi()->CreateEnv(ORT_LOGGING_LEVEL_WARNING, "ortcy", &env_)); + ThrowOnError(GetOrtApi()->CreateSessionOptions(&sess_options_)); + ThrowOnError(GetOrtApi()->CreateRunOptions(&run_options_)); + ThrowOnError(GetOrtApi()->CreateCpuMemoryInfo( + OrtArenaAllocator, OrtMemTypeDefault, &cpu_memory_info_)); + sess_ = nullptr; 
+ cpu_allocator_ = nullptr; + n_inputs_ = 0; + n_outputs_ = 0; + } - OrtInference() { - ThrowOnError(GetOrtApi()->CreateEnv(ORT_LOGGING_LEVEL_WARNING, "ortcy", &env_)); - ThrowOnError(GetOrtApi()->CreateSessionOptions(&sess_options_)); - ThrowOnError(GetOrtApi()->CreateRunOptions(&run_options_)); - ThrowOnError(GetOrtApi()->CreateCpuMemoryInfo(OrtArenaAllocator, OrtMemTypeDefault, &cpu_memory_info_)); - sess_ = nullptr; - cpu_allocator_ = nullptr; - n_inputs_ = 0; - n_outputs_ = 0; - } + void LoadFromFile(const char *filepath) { + EXT_ENFORCE(filepath != nullptr); + EXT_ENFORCE(env_ != nullptr); + EXT_ENFORCE(sess_options_ != nullptr); +#ifdef _WIN32 + std::string name(filepath); + std::wstring_convert> cvt; + std::wstring wname(cvt.from_bytes(name)); + ThrowOnError( + GetOrtApi()->CreateSession(env_, wname.c_str(), sess_options_, &sess_)); +#else + ThrowOnError( + GetOrtApi()->CreateSession(env_, filepath, sess_options_, &sess_)); +#endif + LoadFinalize(); + } + + void LoadFromBytes(const void *model_data, size_t model_data_length) { + ThrowOnError(GetOrtApi()->CreateSessionFromArray( + env_, model_data, model_data_length, sess_options_, &sess_)); + LoadFinalize(); + } + + ~OrtInference() { + if (cpu_allocator_ != nullptr) + GetOrtApi()->ReleaseAllocator(cpu_allocator_); + if (sess_ != nullptr) + GetOrtApi()->ReleaseSession(sess_); + GetOrtApi()->ReleaseSessionOptions(sess_options_); + GetOrtApi()->ReleaseRunOptions(run_options_); + GetOrtApi()->ReleaseMemoryInfo(cpu_memory_info_); + GetOrtApi()->ReleaseEnv(env_); + } - void LoadFromFile(const char* filepath) { - EXT_ENFORCE(filepath != nullptr); - EXT_ENFORCE(env_ != nullptr); - EXT_ENFORCE(sess_options_ != nullptr); - #ifdef _WIN32 - std::string name(filepath); + size_t GetInputCount() const { return n_inputs_; } + size_t GetOutputCount() const { return n_outputs_; } + + void Initialize(const char *optimized_file_path = nullptr, + int graph_optimization_level = -1, int enable_cuda = 0, + int cuda_device_id = 0, 
int set_denormal_as_zero = 0, + int intra_op_num_threads = -1, int inter_op_num_threads = -1, + const char **custom_libs = nullptr) { + if (graph_optimization_level != -1) { + ThrowOnError(GetOrtApi()->SetSessionGraphOptimizationLevel( + sess_options_, (GraphOptimizationLevel)graph_optimization_level)); + } + if (optimized_file_path != nullptr) { + std::string path(optimized_file_path); + if (!path.empty()) { +#ifdef _WIN32 std::wstring_convert> cvt; - std::wstring wname(cvt.from_bytes(name)); - ThrowOnError(GetOrtApi()->CreateSession(env_, wname.c_str(), sess_options_, &sess_)); - #else - ThrowOnError(GetOrtApi()->CreateSession(env_, filepath, sess_options_, &sess_)); - #endif - LoadFinalize(); + std::wstring wpath(cvt.from_bytes(path)); + ThrowOnError(GetOrtApi()->SetOptimizedModelFilePath(sess_options_, + wpath.c_str())); +#else + ThrowOnError(GetOrtApi()->SetOptimizedModelFilePath(sess_options_, + path.c_str())); +#endif + } } - - void LoadFromBytes(const void* model_data, size_t model_data_length) { - ThrowOnError(GetOrtApi()->CreateSessionFromArray(env_, model_data, model_data_length, sess_options_, &sess_)); - LoadFinalize(); + if (enable_cuda) { + OrtCUDAProviderOptions cuda_options; + cuda_options.device_id = cuda_device_id; + cuda_options.do_copy_in_default_stream = true; + // TODO: Support arena configuration for users of test runner + ThrowOnError(GetOrtApi()->SessionOptionsAppendExecutionProvider_CUDA( + sess_options_, &cuda_options)); } - - ~OrtInference() { - if (cpu_allocator_ != nullptr) GetOrtApi()->ReleaseAllocator(cpu_allocator_); - if (sess_ != nullptr) GetOrtApi()->ReleaseSession(sess_); - GetOrtApi()->ReleaseSessionOptions(sess_options_); - GetOrtApi()->ReleaseRunOptions(run_options_); - GetOrtApi()->ReleaseMemoryInfo(cpu_memory_info_); - GetOrtApi()->ReleaseEnv(env_); + // see https://github.com/microsoft/onnxruntime/blob/main/ + // include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h + if (set_denormal_as_zero) { + 
ThrowOnError(GetOrtApi()->AddSessionConfigEntry( + sess_options_, "session.set_denormal_as_zero", "1")); + } + if (intra_op_num_threads != -1) { + ThrowOnError(GetOrtApi()->SetIntraOpNumThreads(sess_options_, + intra_op_num_threads)); + } + if (inter_op_num_threads != -1) { + ThrowOnError(GetOrtApi()->SetInterOpNumThreads(sess_options_, + inter_op_num_threads)); + } + if (custom_libs != nullptr) { +#ifdef _WIN32 + std::wstring_convert> cvt; +#endif + while (*custom_libs != nullptr) { +#ifdef _WIN32 + std::wstring wpath(cvt.from_bytes(*custom_libs)); + ThrowOnError(GetOrtApi()->RegisterCustomOpsLibrary_V2(sess_options_, + wpath.c_str())); +#else + ThrowOnError(GetOrtApi()->RegisterCustomOpsLibrary_V2(sess_options_, + *custom_libs)); +#endif + ++custom_libs; + } } + } - size_t GetInputCount() const { return n_inputs_; } - size_t GetOutputCount() const { return n_outputs_; } + size_t Run(size_t n_inputs, OrtShape *shapes, OrtCpuValue *values, + size_t max_outputs, OrtShape *out_shapes, + OrtCpuValue *out_values) { + if (max_outputs < n_outputs_) + EXT_THROW("Not enough expected outputs, max_outputs=", max_outputs, " > ", + n_outputs_, "."); + if (n_inputs > n_inputs_) + EXT_THROW("Too many inputs, n_inputs=", n_inputs, " > ", n_inputs, "."); + std::vector ort_values(n_inputs); - void Initialize(const char* optimized_file_path = nullptr, - int graph_optimization_level = -1, - int enable_cuda = 0, - int cuda_device_id = 0, - int set_denormal_as_zero = 0, - int intra_op_num_threads = -1, - int inter_op_num_threads = -1, - const char** custom_libs = nullptr) { - if (graph_optimization_level != -1) { - ThrowOnError(GetOrtApi()->SetSessionGraphOptimizationLevel( - sess_options_, (GraphOptimizationLevel)graph_optimization_level)); - } - if (optimized_file_path != nullptr) { - std::string path(optimized_file_path); - if (!path.empty()) { - #ifdef _WIN32 - std::wstring_convert> cvt; - std::wstring wpath(cvt.from_bytes(path)); - 
ThrowOnError(GetOrtApi()->SetOptimizedModelFilePath( - sess_options_, wpath.c_str())); - #else - ThrowOnError(GetOrtApi()->SetOptimizedModelFilePath( - sess_options_, path.c_str())); - #endif - } - } - if (enable_cuda) { - OrtCUDAProviderOptions cuda_options; - cuda_options.device_id = cuda_device_id; - cuda_options.do_copy_in_default_stream = true; - // TODO: Support arena configuration for users of test runner - ThrowOnError(GetOrtApi()->SessionOptionsAppendExecutionProvider_CUDA(sess_options_, &cuda_options)); - } - // see https://github.com/microsoft/onnxruntime/blob/main/ - // include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h - if (set_denormal_as_zero) { - ThrowOnError(GetOrtApi()->AddSessionConfigEntry( - sess_options_, "session.set_denormal_as_zero", "1")); - } - if (intra_op_num_threads != -1) { - ThrowOnError(GetOrtApi()->SetIntraOpNumThreads(sess_options_, intra_op_num_threads)); - } - if (inter_op_num_threads != -1) { - ThrowOnError(GetOrtApi()->SetInterOpNumThreads(sess_options_, inter_op_num_threads)); - } - if (custom_libs != nullptr) { - #ifdef _WIN32 - std::wstring_convert> cvt; - #endif - while (*custom_libs != nullptr) { - #ifdef _WIN32 - std::wstring wpath(cvt.from_bytes(*custom_libs)); - ThrowOnError(GetOrtApi()->RegisterCustomOpsLibrary_V2(sess_options_, wpath.c_str())); - #else - ThrowOnError(GetOrtApi()->RegisterCustomOpsLibrary_V2(sess_options_, *custom_libs)); - #endif - ++custom_libs; - } - } + for (size_t i = 0; i < n_inputs; ++i) { + ONNXTensorElementDataType elem_type = + (ONNXTensorElementDataType)values[i].elem_type(); + ThrowOnError(GetOrtApi()->CreateTensorWithDataAsOrtValue( + cpu_memory_info_, values[i].data(), + values[i].size() * ElementSize(elem_type), shapes[i].dims(), + shapes[i].ndim(), elem_type, &ort_values[i])); } - size_t Run(size_t n_inputs, - OrtShape* shapes, - OrtCpuValue* values, - size_t max_outputs, - OrtShape* out_shapes, - OrtCpuValue* out_values) { - if (max_outputs < n_outputs_) - 
EXT_THROW("Not enough expected outputs, max_outputs=", - max_outputs, " > ", n_outputs_, "."); - if (n_inputs > n_inputs_) - EXT_THROW("Too many inputs, n_inputs=", n_inputs, " > ", n_inputs, "."); - std::vector ort_values(n_inputs); - - for(size_t i = 0; i < n_inputs; ++i) { - ONNXTensorElementDataType elem_type = (ONNXTensorElementDataType)values[i].elem_type(); - ThrowOnError(GetOrtApi()->CreateTensorWithDataAsOrtValue( - cpu_memory_info_, values[i].data(), - values[i].size() * ElementSize(elem_type), - shapes[i].dims(), shapes[i].ndim(), - elem_type, &ort_values[i])); - } + std::vector ort_values_out(n_outputs_); + ThrowOnError(GetOrtApi()->Run(sess_, run_options_, input_names_call_.data(), + ort_values.data(), n_inputs, + output_names_call_.data(), n_outputs_, + ort_values_out.data())); - std::vector ort_values_out(n_outputs_); - ThrowOnError(GetOrtApi()->Run( - sess_, run_options_, - input_names_call_.data(), ort_values.data(), n_inputs, - output_names_call_.data(), n_outputs_, ort_values_out.data())); - - for(size_t i = 0; i < n_inputs; ++i) { - GetOrtApi()->ReleaseValue(ort_values[i]); - } - OrtTensorTypeAndShapeInfo* info; - ONNXTensorElementDataType elem_type; - size_t size, n_dims; - void* data; - for(size_t i = 0; i < n_outputs_; ++i) { - ThrowOnError(GetOrtApi()->GetTensorTypeAndShape(ort_values_out[i], &info)); - ThrowOnError(GetOrtApi()->GetTensorElementType(info, &elem_type)); - if (elem_type == ONNXTensorElementDataType::ONNX_TENSOR_ELEMENT_DATA_TYPE_STRING) { - GetOrtApi()->ReleaseTensorTypeAndShapeInfo(info); - for(; i < n_outputs_; ++i) { - GetOrtApi()->ReleaseValue(ort_values_out[i]); - } - throw std::runtime_error("tensor(string) is not supported as outputs."); - } - ThrowOnError(GetOrtApi()->GetTensorShapeElementCount(info, &size)); - ThrowOnError(GetOrtApi()->GetTensorMutableData(ort_values_out[i], &data)); - ThrowOnError(GetOrtApi()->GetDimensionsCount(info, &n_dims)); - out_shapes[i].init(n_dims); - 
ThrowOnError(GetOrtApi()->GetDimensions(info, (int64_t*)out_shapes[i].dims(), n_dims)); - /* typedef void copy_allocate(size_t output, int elem_type, size_t size, - OrtShape shape, void* data, void* args); */ - GetOrtApi()->ReleaseTensorTypeAndShapeInfo(info); - out_values[i].init(size, elem_type, data, ort_values_out[i]); - // GetOrtApi()->ReleaseValue(ort_values_out[i]); + for (size_t i = 0; i < n_inputs; ++i) { + GetOrtApi()->ReleaseValue(ort_values[i]); + } + OrtTensorTypeAndShapeInfo *info; + ONNXTensorElementDataType elem_type; + size_t size, n_dims; + void *data; + for (size_t i = 0; i < n_outputs_; ++i) { + ThrowOnError( + GetOrtApi()->GetTensorTypeAndShape(ort_values_out[i], &info)); + ThrowOnError(GetOrtApi()->GetTensorElementType(info, &elem_type)); + if (elem_type == + ONNXTensorElementDataType::ONNX_TENSOR_ELEMENT_DATA_TYPE_STRING) { + GetOrtApi()->ReleaseTensorTypeAndShapeInfo(info); + for (; i < n_outputs_; ++i) { + GetOrtApi()->ReleaseValue(ort_values_out[i]); } - return n_outputs_; + throw std::runtime_error("tensor(string) is not supported as outputs."); + } + ThrowOnError(GetOrtApi()->GetTensorShapeElementCount(info, &size)); + ThrowOnError(GetOrtApi()->GetTensorMutableData(ort_values_out[i], &data)); + ThrowOnError(GetOrtApi()->GetDimensionsCount(info, &n_dims)); + out_shapes[i].init(n_dims); + ThrowOnError(GetOrtApi()->GetDimensions( + info, (int64_t *)out_shapes[i].dims(), n_dims)); + /* typedef void copy_allocate(size_t output, int elem_type, size_t size, + OrtShape shape, void* data, void* args); */ + GetOrtApi()->ReleaseTensorTypeAndShapeInfo(info); + out_values[i].init(size, elem_type, data, ort_values_out[i]); + // GetOrtApi()->ReleaseValue(ort_values_out[i]); } + return n_outputs_; + } protected: - void LoadFinalize() { - EXT_ENFORCE(cpu_memory_info_ != nullptr); - ThrowOnError(GetOrtApi()->CreateAllocator(sess_, cpu_memory_info_ , &cpu_allocator_)); - EXT_ENFORCE(cpu_allocator_ != nullptr); - 
ThrowOnError(GetOrtApi()->SessionGetInputCount(sess_, &n_inputs_)); - ThrowOnError(GetOrtApi()->SessionGetOutputCount(sess_, &n_outputs_)); - input_names_.reserve(n_inputs_); - output_names_.reserve(n_outputs_); - - char* name; - for(size_t i = 0; i < n_inputs_; ++i) { - ThrowOnError(GetOrtApi()->SessionGetInputName(sess_, i, cpu_allocator_, &name)); - input_names_.emplace_back(std::string(name)); - ThrowOnError(GetOrtApi()->AllocatorFree(cpu_allocator_, name)); - } - for(size_t i = 0; i < n_outputs_; ++i) { - ThrowOnError(GetOrtApi()->SessionGetOutputName(sess_, i, cpu_allocator_, &name)); - output_names_.emplace_back(std::string(name)); - ThrowOnError(GetOrtApi()->AllocatorFree(cpu_allocator_, name)); - } - input_names_call_.resize(n_inputs_); - for(size_t i = 0; i < n_inputs_; ++i) { - input_names_call_[i] = input_names_[i].c_str(); - } - output_names_call_.resize(n_inputs_); - for(size_t i = 0; i < n_inputs_; ++i) { - output_names_call_[i] = output_names_[i].c_str(); - } + void LoadFinalize() { + EXT_ENFORCE(cpu_memory_info_ != nullptr); + ThrowOnError( + GetOrtApi()->CreateAllocator(sess_, cpu_memory_info_, &cpu_allocator_)); + EXT_ENFORCE(cpu_allocator_ != nullptr); + ThrowOnError(GetOrtApi()->SessionGetInputCount(sess_, &n_inputs_)); + ThrowOnError(GetOrtApi()->SessionGetOutputCount(sess_, &n_outputs_)); + input_names_.reserve(n_inputs_); + output_names_.reserve(n_outputs_); + + char *name; + for (size_t i = 0; i < n_inputs_; ++i) { + ThrowOnError( + GetOrtApi()->SessionGetInputName(sess_, i, cpu_allocator_, &name)); + input_names_.emplace_back(std::string(name)); + ThrowOnError(GetOrtApi()->AllocatorFree(cpu_allocator_, name)); + } + for (size_t i = 0; i < n_outputs_; ++i) { + ThrowOnError( + GetOrtApi()->SessionGetOutputName(sess_, i, cpu_allocator_, &name)); + output_names_.emplace_back(std::string(name)); + ThrowOnError(GetOrtApi()->AllocatorFree(cpu_allocator_, name)); + } + input_names_call_.resize(n_inputs_); + for (size_t i = 0; i < n_inputs_; ++i) { 
+ input_names_call_[i] = input_names_[i].c_str(); } + output_names_call_.resize(n_inputs_); + for (size_t i = 0; i < n_inputs_; ++i) { + output_names_call_[i] = output_names_[i].c_str(); + } + } private: - // before loading the model - OrtEnv* env_; - OrtSessionOptions* sess_options_; - OrtRunOptions* run_options_; - OrtMemoryInfo* cpu_memory_info_; + // before loading the model + OrtEnv *env_; + OrtSessionOptions *sess_options_; + OrtRunOptions *run_options_; + OrtMemoryInfo *cpu_memory_info_; private: - // after loading the model - OrtSession* sess_; - OrtAllocator* cpu_allocator_; - size_t n_inputs_; - size_t n_outputs_; - std::vector input_names_; - std::vector output_names_; - std::vector input_names_call_; - std::vector output_names_call_; + // after loading the model + OrtSession *sess_; + OrtAllocator *cpu_allocator_; + size_t n_inputs_; + size_t n_outputs_; + std::vector input_names_; + std::vector output_names_; + std::vector input_names_call_; + std::vector output_names_call_; }; /* @@ -253,47 +264,44 @@ typedef enum { //////// SIMPLE API ////// -OrtSessionType* create_session() { return (OrtSessionType*)(new OrtInference()); } -void delete_session(OrtSessionType* ptr) { - if (ptr == nullptr) - throw std::runtime_error("Cannot delete a null pointer (delete_session)."); - delete (OrtInference*)ptr; +OrtSessionType *create_session() { + return (OrtSessionType *)(new OrtInference()); +} +void delete_session(OrtSessionType *ptr) { + if (ptr == nullptr) + throw std::runtime_error("Cannot delete a null pointer (delete_session)."); + delete (OrtInference *)ptr; +} +void session_load_from_file(OrtSessionType *ptr, const char *filename) { + ((OrtInference *)ptr)->LoadFromFile(filename); +} +void session_load_from_bytes(OrtSessionType *ptr, const void *buffer, + size_t size) { + ((OrtInference *)ptr)->LoadFromBytes(buffer, size); +} +size_t session_get_input_count(OrtSessionType *ptr) { + return ((OrtInference *)ptr)->GetInputCount(); } -void 
session_load_from_file(OrtSessionType* ptr, const char* filename) { ((OrtInference*)ptr)->LoadFromFile(filename); } -void session_load_from_bytes(OrtSessionType* ptr, const void* buffer, size_t size) { - ((OrtInference*)ptr)->LoadFromBytes(buffer, size); +size_t session_get_output_count(OrtSessionType *ptr) { + return ((OrtInference *)ptr)->GetOutputCount(); } -size_t session_get_input_count(OrtSessionType* ptr) { return ((OrtInference*)ptr)->GetInputCount(); } -size_t session_get_output_count(OrtSessionType* ptr) { return ((OrtInference*)ptr)->GetOutputCount(); } -void session_initialize(OrtSessionType* ptr, - const char* optimized_file_path, - int graph_optimization_level, - int enable_cuda, - int cuda_device_id, - int set_denormal_as_zero, - int intra_op_num_threads, - int inter_op_num_threads, - char** custom_libs) { - ((OrtInference*)ptr)->Initialize(optimized_file_path, - graph_optimization_level, - enable_cuda, - cuda_device_id, - set_denormal_as_zero, - intra_op_num_threads, - inter_op_num_threads, - (const char**)custom_libs); +void session_initialize(OrtSessionType *ptr, const char *optimized_file_path, + int graph_optimization_level, int enable_cuda, + int cuda_device_id, int set_denormal_as_zero, + int intra_op_num_threads, int inter_op_num_threads, + char **custom_libs) { + ((OrtInference *)ptr) + ->Initialize(optimized_file_path, graph_optimization_level, enable_cuda, + cuda_device_id, set_denormal_as_zero, intra_op_num_threads, + inter_op_num_threads, (const char **)custom_libs); } -size_t session_run(OrtSessionType* ptr, - size_t n_inputs, - OrtShape* shapes, - OrtCpuValue* values, - size_t max_outputs, - OrtShape* out_shapes, - OrtCpuValue* out_values) { - return ((OrtInference*)ptr)->Run(n_inputs, shapes, values, - max_outputs, out_shapes, out_values); +size_t session_run(OrtSessionType *ptr, size_t n_inputs, OrtShape *shapes, + OrtCpuValue *values, size_t max_outputs, + OrtShape *out_shapes, OrtCpuValue *out_values) { + return ((OrtInference 
*)ptr) + ->Run(n_inputs, shapes, values, max_outputs, out_shapes, out_values); } } // namespace ortapi diff --git a/onnx_extended/ortcy/wrap/ortapi.h b/onnx_extended/ortcy/wrap/ortapi.h index 718feea2..929b9daf 100644 --- a/onnx_extended/ortcy/wrap/ortapi.h +++ b/onnx_extended/ortcy/wrap/ortapi.h @@ -10,100 +10,99 @@ namespace ortapi { inline size_t ElementSize(ONNXTensorElementDataType elem_type) { - switch(elem_type) { - case ONNXTensorElementDataType::ONNX_TENSOR_ELEMENT_DATA_TYPE_DOUBLE: - case ONNXTensorElementDataType::ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT64: - case ONNXTensorElementDataType::ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64: - return 8; - case ONNXTensorElementDataType::ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT: - case ONNXTensorElementDataType::ONNX_TENSOR_ELEMENT_DATA_TYPE_INT32: - case ONNXTensorElementDataType::ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT32: - return 4; - case ONNXTensorElementDataType::ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT16: - case ONNXTensorElementDataType::ONNX_TENSOR_ELEMENT_DATA_TYPE_BFLOAT16: - case ONNXTensorElementDataType::ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT16: - case ONNXTensorElementDataType::ONNX_TENSOR_ELEMENT_DATA_TYPE_INT16: - return 2; - case ONNXTensorElementDataType::ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT8: - case ONNXTensorElementDataType::ONNX_TENSOR_ELEMENT_DATA_TYPE_INT8: - return 2; - default: - throw std::runtime_error( - "One element type is not implemented in function `ortapi::ElementSize()`."); - } + switch (elem_type) { + case ONNXTensorElementDataType::ONNX_TENSOR_ELEMENT_DATA_TYPE_DOUBLE: + case ONNXTensorElementDataType::ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT64: + case ONNXTensorElementDataType::ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64: + return 8; + case ONNXTensorElementDataType::ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT: + case ONNXTensorElementDataType::ONNX_TENSOR_ELEMENT_DATA_TYPE_INT32: + case ONNXTensorElementDataType::ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT32: + return 4; + case 
ONNXTensorElementDataType::ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT16: + case ONNXTensorElementDataType::ONNX_TENSOR_ELEMENT_DATA_TYPE_BFLOAT16: + case ONNXTensorElementDataType::ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT16: + case ONNXTensorElementDataType::ONNX_TENSOR_ELEMENT_DATA_TYPE_INT16: + return 2; + case ONNXTensorElementDataType::ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT8: + case ONNXTensorElementDataType::ONNX_TENSOR_ELEMENT_DATA_TYPE_INT8: + return 2; + default: + throw std::runtime_error("One element type is not implemented in function " + "`ortapi::ElementSize()`."); + } } inline size_t ElementSizeI(int elem_type) { - return ElementSize((ONNXTensorElementDataType)elem_type); + return ElementSize((ONNXTensorElementDataType)elem_type); } class OrtShape { private: - int64_t size_; - int64_t dims_[8]; + int64_t size_; + int64_t dims_[8]; public: - inline OrtShape() { size_ = 0; } - inline OrtShape(size_t ndim) { init(ndim); } - inline void init(size_t ndim) { - if (ndim > 8) - throw std::runtime_error("shape cannot have more than 8 dimensions."); - size_ = ndim; - } - inline int64_t ndim() const { return size_; } - inline void set(size_t i, int64_t dim) { dims_[i] = dim; } - inline const int64_t *dims() const { return dims_; } + inline OrtShape() { size_ = 0; } + inline OrtShape(size_t ndim) { init(ndim); } + inline void init(size_t ndim) { + if (ndim > 8) + throw std::runtime_error("shape cannot have more than 8 dimensions."); + size_ = ndim; + } + inline int64_t ndim() const { return size_; } + inline void set(size_t i, int64_t dim) { dims_[i] = dim; } + inline const int64_t *dims() const { return dims_; } }; class OrtCpuValue { - private: - size_t size_; - int elem_type_; // ONNXTensorElementDataType - void* data_; - void* ort_value_; - public: - inline OrtCpuValue() { elem_type_ = -1; size_ = 0; ort_value_ = nullptr; data_ = nullptr; } - inline void init(size_t size, int elem_type, void* data, void* ort_value) { - size_ = size; - elem_type_ = elem_type; - data_ = data; - 
ort_value_ = ort_value; - } - inline size_t size() { return size_; } - inline int elem_type() { return elem_type_; } - inline void* data() { return data_; } - void free_ort_value(); +private: + size_t size_; + int elem_type_; // ONNXTensorElementDataType + void *data_; + void *ort_value_; + +public: + inline OrtCpuValue() { + elem_type_ = -1; + size_ = 0; + ort_value_ = nullptr; + data_ = nullptr; + } + inline void init(size_t size, int elem_type, void *data, void *ort_value) { + size_ = size; + elem_type_ = elem_type; + data_ = data; + ort_value_ = ort_value; + } + inline size_t size() { return size_; } + inline int elem_type() { return elem_type_; } + inline void *data() { return data_; } + void free_ort_value(); }; // Simplified API for this project. // see https://onnxruntime.ai/docs/api/c/ -typedef void release(size_t output, int elem_type, size_t size, OrtShape* shape, void* data, void* args); +typedef void release(size_t output, int elem_type, size_t size, OrtShape *shape, + void *data, void *args); std::vector get_available_providers(); OrtSessionType *create_session(); void delete_session(OrtSessionType *); -void session_load_from_file(OrtSessionType*, const char* filename); -void session_load_from_bytes(OrtSessionType*, const void* buffer, size_t size); -void session_initialize(OrtSessionType* ptr, - const char* optimized_file_path, - int graph_optimization_level = -1, - int enable_cuda = 0, - int cuda_device_id = 0, - int set_denormal_as_zero = 0, +void session_load_from_file(OrtSessionType *, const char *filename); +void session_load_from_bytes(OrtSessionType *, const void *buffer, size_t size); +void session_initialize(OrtSessionType *ptr, const char *optimized_file_path, + int graph_optimization_level = -1, int enable_cuda = 0, + int cuda_device_id = 0, int set_denormal_as_zero = 0, int intra_op_num_threads = -1, int inter_op_num_threads = -1, - char** custom_libs = nullptr); + char **custom_libs = nullptr); size_t 
session_get_input_count(OrtSessionType *); size_t session_get_output_count(OrtSessionType *); -size_t session_run(OrtSessionType* ptr, - size_t n_inputs, - OrtShape* shapes, - OrtCpuValue* values, - size_t max_outputs, - OrtShape* out_shapes, - OrtCpuValue* out_values); - +size_t session_run(OrtSessionType *ptr, size_t n_inputs, OrtShape *shapes, + OrtCpuValue *values, size_t max_outputs, + OrtShape *out_shapes, OrtCpuValue *out_values); } // namespace ortapi diff --git a/onnx_extended/ortcy/wrap/ortapi_inline.h b/onnx_extended/ortcy/wrap/ortapi_inline.h index 188a66ae..33813492 100644 --- a/onnx_extended/ortcy/wrap/ortapi_inline.h +++ b/onnx_extended/ortcy/wrap/ortapi_inline.h @@ -1,27 +1,29 @@ #pragma once -#include "helpers.h" - #define ORT_API_MANUAL_INIT #include "onnxruntime_c_api.h" #undef ORT_API_MANUAL_INIT +#include "onnx_extended_helpers.h" + namespace ortapi { -inline static const OrtApi *GetOrtApi() { - const OrtApi* api_ = OrtGetApiBase()->GetApi(ORT_API_VERSION); - return api_; +inline static const OrtApi *GetOrtApi() { + const OrtApi *api_ = OrtGetApiBase()->GetApi(ORT_API_VERSION); + return api_; } -inline const char* ort_version() { return OrtGetApiBase()->GetVersionString(); } - -inline void _ThrowOnError_(OrtStatus* ort_status, const char* filename, int line) { - if (ort_status) { - std::string message(GetOrtApi()->GetErrorMessage(ort_status)); - OrtErrorCode code = GetOrtApi()->GetErrorCode(ort_status); - throw std::runtime_error( - orthelpers::MakeString("error: onnxruntime(", code, "), ", message, "\n ", filename, ":", line)); - } +inline const char *ort_version() { return OrtGetApiBase()->GetVersionString(); } + +inline void _ThrowOnError_(OrtStatus *ort_status, const char *filename, + int line) { + if (ort_status) { + std::string message(GetOrtApi()->GetErrorMessage(ort_status)); + OrtErrorCode code = GetOrtApi()->GetErrorCode(ort_status); + throw std::runtime_error(onnx_extended_helpers::MakeString( + "error: onnxruntime(", code, "), ", 
message, "\n ", filename, ":", + line)); + } } #define ThrowOnError(ort_status) _ThrowOnError_(ort_status, __FILE__, __LINE__) diff --git a/onnx_extended/ortops/common/common_kernels.h b/onnx_extended/ortops/common/common_kernels.h index 8e326fe2..f5fec662 100644 --- a/onnx_extended/ortops/common/common_kernels.h +++ b/onnx_extended/ortops/common/common_kernels.h @@ -1,75 +1,17 @@ #pragma once -#include "helpers.h" #define ORT_API_MANUAL_INIT #include #include #undef ORT_API_MANUAL_INIT -namespace ortops { - -inline void MakeStringInternal(std::ostringstream &ss) noexcept {} - -template -inline void MakeStringInternal(std::ostringstream &ss, const T &t) noexcept { - ss << t; -} - -template <> -inline void MakeStringInternal(std::ostringstream &ss, - const std::vector &t) noexcept { - for (auto it : t) - ss << "x" << it; -} - -template <> -inline void MakeStringInternal(std::ostringstream &ss, - const std::vector &t) noexcept { - for (auto it : t) - ss << "x" << it; -} - -template <> -inline void MakeStringInternal(std::ostringstream &ss, - const std::vector &t) noexcept { - for (auto it : t) - ss << "x" << it; -} +#include "onnx_extended_helpers.h" -template <> -inline void MakeStringInternal(std::ostringstream &ss, - const std::vector &t) noexcept { - for (auto it : t) - ss << "x" << it; -} - -template <> -inline void MakeStringInternal(std::ostringstream &ss, - const std::vector &t) noexcept { - for (auto it : t) - ss << "x" << it; -} - -template <> -inline void MakeStringInternal(std::ostringstream &ss, - const std::vector &t) noexcept { - for (auto it : t) - ss << "x" << it; -} - -template -inline void MakeStringInternal(std::ostringstream &ss, const T &t, - const Args &...args) noexcept { - MakeStringInternal(ss, t); - MakeStringInternal(ss, args...); -} - -template inline std::string MakeString(const Args &...args) { - std::ostringstream ss; - MakeStringInternal(ss, args...); - return std::string(ss.str()); -} +namespace ortops { +//////////////////////// +// 
errors and exceptions +//////////////////////// inline void _ThrowOnError_(OrtStatus *ort_status, const char *filename, int line, const OrtApi &api) { @@ -81,9 +23,9 @@ inline void _ThrowOnError_(OrtStatus *ort_status, const char *filename, std::string message(api.GetErrorMessage(ort_status)); api.ReleaseStatus(ort_status); if (code != ORT_OK) { - throw std::runtime_error( - orthelpers::MakeString("error: onnxruntime(", code, "), ", message, - "\n ", filename, ":", line)); + throw std::runtime_error(onnx_extended_helpers::MakeString( + "error: onnxruntime(", code, "), ", message, "\n ", filename, + ":", line)); } } } @@ -92,6 +34,10 @@ inline void _ThrowOnError_(OrtStatus *ort_status, const char *filename, #define ThrowOnError(api, ort_status) \ _ThrowOnError_(ort_status, __FILE__, __LINE__, api) +//////////////////// +// kernel attributes +//////////////////// + inline std::string KernelInfoGetOptionalAttributeString( const OrtApi &api, const OrtKernelInfo *info, const char *name, const std::string &default_value) { @@ -137,6 +83,51 @@ KernelInfoGetAttributeApi(const OrtApi &api, const OrtKernelInfo *info, return api.KernelInfoGetAttribute_float(info, name, &out); } +template <> +inline OrtStatus *KernelInfoGetAttributeApi>( + const OrtApi &api, const OrtKernelInfo *info, const char *name, + std::vector &out) { + size_t size = 0; + + // Feed nullptr for the data buffer to query the true size of the attribute + OrtStatus *status = + api.KernelInfoGetAttributeArray_float(info, name, nullptr, &size); + + if (status == nullptr) { + out.resize(size); + status = + api.KernelInfoGetAttributeArray_float(info, name, out.data(), &size); + } + + return status; +} + +template <> +inline OrtStatus *KernelInfoGetAttributeApi>( + const OrtApi &api, const OrtKernelInfo *info, const char *name, + std::vector &out) { + size_t size = 0; + + // Feed nullptr for the data buffer to query the true size of the attribute + OrtStatus *status = + api.KernelInfoGetAttributeArray_int64(info, 
name, nullptr, &size); + + if (status == nullptr) { + out.resize(size); + ThrowOnError(api, api.KernelInfoGetAttributeArray_int64(info, name, + out.data(), &size)); + } + return status; +} + +template <> +inline OrtStatus *KernelInfoGetAttributeApi>( + const OrtApi &api, const OrtKernelInfo *info, const char *name, + std::vector &output) { + EXT_THROW("Unable to retrieve attribute as an array of strings. " + "You should use a single comma separated string."); +} + template inline T KernelInfoGetOptionalAttribute(const OrtApi &api, const OrtKernelInfo *info, @@ -144,9 +135,8 @@ inline T KernelInfoGetOptionalAttribute(const OrtApi &api, T out; OrtStatus *status = KernelInfoGetAttributeApi(api, info, name, out); - if (status == nullptr) { + if (status == nullptr) return out; - } OrtErrorCode code = api.GetErrorCode(status); if (code == ORT_FAIL) { api.ReleaseStatus(status); diff --git a/onnx_extended/ortops/common/common_kernels_cuda.h b/onnx_extended/ortops/common/common_kernels_cuda.h index 45c05717..51b19ca9 100644 --- a/onnx_extended/ortops/common/common_kernels_cuda.h +++ b/onnx_extended/ortops/common/common_kernels_cuda.h @@ -2,7 +2,7 @@ #include "common_kernels.h" #include "cublas_v2.h" -#include "helpers.h" +#include "onnx_extended_helpers.h" #include namespace ortops { diff --git a/onnx_extended/ortops/optim/__init__.py b/onnx_extended/ortops/optim/__init__.py new file mode 100644 index 00000000..8b137891 --- /dev/null +++ b/onnx_extended/ortops/optim/__init__.py @@ -0,0 +1 @@ + diff --git a/onnx_extended/ortops/optim/cpu/__init__.py b/onnx_extended/ortops/optim/cpu/__init__.py new file mode 100644 index 00000000..77cfc65e --- /dev/null +++ b/onnx_extended/ortops/optim/cpu/__init__.py @@ -0,0 +1,53 @@ +import os +import textwrap +from typing import List +from ... 
import _get_ort_ext_libs + + +def get_ort_ext_libs() -> List[str]: + """ + Returns the list of libraries implementing new simple + :epkg:`onnxruntime` kernels implemented for the + :epkg:`CPUExecutionProvider`. + """ + return _get_ort_ext_libs(os.path.dirname(__file__)) + + +def documentation() -> List[str]: + """ + Returns a list of rst strings documenting every implemented kernel + in this subfolder. + """ + return list( + map( + textwrap.dedent, + [ + """ + onnx_extented.ortops.optim.cpu.TreeEnsembleRegressor + ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + Optimized implementation of the TreeEnsembleRegressor operator. + + **Provider** + + CPUExecutionProvider + + **Inputs** + + * X (T1): tensor of type T1 + + **Outputs** + + * Y (T2): prediction of type T2 + + **Constraints** + + * T1: float, double + * T2: float, double + + **Attributes** + + """, + ], + ) + ) diff --git a/onnx_extended/ortops/optim/cpu/ort_optim_cpu_lib.cc b/onnx_extended/ortops/optim/cpu/ort_optim_cpu_lib.cc new file mode 100644 index 00000000..0cbf20c5 --- /dev/null +++ b/onnx_extended/ortops/optim/cpu/ort_optim_cpu_lib.cc @@ -0,0 +1,42 @@ +// Source: https://github.com/microsoft/onnxruntime/tree/main/ +// onnxruntime/test/testdata/custom_op_get_const_input_test_library + +#include +#include + +#include "ort_optim_cpu_lib.h" +#include "tree_ensemble.h" + +static const char *c_OpDomain = "onnx_extented.ortops.optim.cpu"; + +static void AddOrtCustomOpDomainToContainer(Ort::CustomOpDomain &&domain) { + static std::vector ort_custom_op_domain_container; + static std::mutex ort_custom_op_domain_mutex; + std::lock_guard lock(ort_custom_op_domain_mutex); + ort_custom_op_domain_container.push_back(std::move(domain)); +} + +OrtStatus *ORT_API_CALL RegisterCustomOps(OrtSessionOptions *options, + const OrtApiBase *api_base) { + Ort::InitApi(api_base->GetApi(ORT_API_VERSION)); + Ort::UnownedSessionOptions session_options(options); + + // An instance remaining available until onnxruntime unloads the library.
+ static ortops::TreeEnsembleRegressor c_TreeEnsembleRegressor; + + OrtStatus *result = nullptr; + + try { + Ort::CustomOpDomain domain{c_OpDomain}; + + domain.Add(&c_TreeEnsembleRegressor); + + session_options.Add(domain); + AddOrtCustomOpDomainToContainer(std::move(domain)); + } catch (const std::exception &e) { + Ort::Status status{e}; + result = status.release(); + } + + return result; +} diff --git a/onnx_extended/ortops/optim/cpu/ort_optim_cpu_lib.h b/onnx_extended/ortops/optim/cpu/ort_optim_cpu_lib.h new file mode 100644 index 00000000..b7f1241e --- /dev/null +++ b/onnx_extended/ortops/optim/cpu/ort_optim_cpu_lib.h @@ -0,0 +1,14 @@ +#pragma once + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +ORT_EXPORT OrtStatus *ORT_API_CALL +RegisterCustomOps(OrtSessionOptions *options, const OrtApiBase *api_base); + +#ifdef __cplusplus +} +#endif diff --git a/onnx_extended/ortops/optim/cpu/tree_ensemble.cc b/onnx_extended/ortops/optim/cpu/tree_ensemble.cc new file mode 100644 index 00000000..50c1e77e --- /dev/null +++ b/onnx_extended/ortops/optim/cpu/tree_ensemble.cc @@ -0,0 +1,145 @@ +#include "tree_ensemble.h" + +namespace ortops { + +//////////////////////// +// Operators declaration +//////////////////////// + +void *TreeEnsembleRegressor::CreateKernel(const OrtApi &api, + const OrtKernelInfo *info) const { + return std::make_unique(api, info).release(); +}; + +const char *TreeEnsembleRegressor::GetName() const { + return "TreeEnsembleRegressor"; +}; + +const char *TreeEnsembleRegressor::GetExecutionProviderType() const { + return "CPUExecutionProvider"; +}; + +size_t TreeEnsembleRegressor::GetInputTypeCount() const { return 1; }; + +ONNXTensorElementDataType +TreeEnsembleRegressor::GetInputType(size_t index) const { + return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT; +}; + +size_t TreeEnsembleRegressor::GetOutputTypeCount() const { return 1; }; + +ONNXTensorElementDataType +TreeEnsembleRegressor::GetOutputType(size_t index) const { + return 
ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT; +}; + +//////////////////////// +// Kernel initialization +//////////////////////// + +TreeEnsembleKernel::TreeEnsembleKernel(const OrtApi &api, + const OrtKernelInfo *info) { + reg_float_float_float = nullptr; + + std::string aggregate_function = KernelInfoGetOptionalAttributeString( + api, info, "aggregate_function", "SUM"); + std::vector base_values = KernelInfoGetOptionalAttribute( + api, info, "base_values", std::vector()); + n_targets_or_classes = KernelInfoGetOptionalAttribute( + api, info, "n_targets", static_cast(1)); + std::vector nodes_falsenodeids = KernelInfoGetOptionalAttribute( + api, info, "nodes_falsenodeids", std::vector()); + std::vector nodes_featureids = KernelInfoGetOptionalAttribute( + api, info, "nodes_featureids", std::vector()); + std::vector nodes_hitrates = KernelInfoGetOptionalAttribute( + api, info, "nodes_hitrates", std::vector()); + std::vector nodes_missing_value_tracks_true = + KernelInfoGetOptionalAttribute( + api, info, "nodes_missing_value_tracks_true", std::vector()); + std::string nodes_modes_single = + KernelInfoGetOptionalAttributeString(api, info, "nodes_modes", ""); + std::vector nodes_nodeids = KernelInfoGetOptionalAttribute( + api, info, "nodes_nodeids", std::vector()); + std::vector nodes_treeids = KernelInfoGetOptionalAttribute( + api, info, "nodes_treeids", std::vector()); + std::vector nodes_truenodeids = KernelInfoGetOptionalAttribute( + api, info, "nodes_truenodeids", std::vector()); + std::vector nodes_values = KernelInfoGetOptionalAttribute( + api, info, "nodes_values", std::vector()); + std::string post_transform = + KernelInfoGetOptionalAttributeString(api, info, "post_transform", "NONE"); + + std::vector target_class_ids = KernelInfoGetOptionalAttribute( + api, info, "target_ids", std::vector()); + std::vector target_class_nodeids = KernelInfoGetOptionalAttribute( + api, info, "target_nodeids", std::vector()); + std::vector target_class_treeids = 
KernelInfoGetOptionalAttribute( + api, info, "target_treeids", std::vector()); + std::vector target_class_weights = KernelInfoGetOptionalAttribute( + api, info, "target_weights", std::vector()); + + std::vector nodes_modes = SplitString(nodes_modes_single, ','); + EXT_ENFORCE(n_targets_or_classes > 0); + EXT_ENFORCE(nodes_values.size() > 0); + EXT_ENFORCE(nodes_nodeids.size() > 0); + EXT_ENFORCE(nodes_modes.size() == nodes_falsenodeids.size(), + " nodes_modes.size()==", nodes_modes.size(), + "!=", nodes_falsenodeids.size(), + ", nodes_modes=", nodes_modes_single, "."); + EXT_ENFORCE(n_targets_or_classes > 0); + + std::unique_ptr> ptr( + new onnx_c_ops::TreeEnsembleCommon()); + reg_float_float_float.swap(ptr); + auto status = reg_float_float_float->Init( + aggregate_function, base_values, n_targets_or_classes, nodes_falsenodeids, + nodes_featureids, nodes_hitrates, nodes_missing_value_tracks_true, + nodes_modes, nodes_nodeids, nodes_treeids, nodes_truenodeids, + nodes_values, post_transform, target_class_ids, target_class_nodeids, + target_class_treeids, target_class_weights); + EXT_ENFORCE(status.IsOK(), "The tree ensemble initialisation failed."); + + int64_t parallel_tree = KernelInfoGetOptionalAttribute( + api, info, "parallel_tree", static_cast(80)); + int64_t parallel_tree_N = KernelInfoGetOptionalAttribute( + api, info, "parallel_tree_N", static_cast(128)); + int64_t parallel_N = KernelInfoGetOptionalAttribute(api, info, "parallel_N", + static_cast(50)); + int64_t batch_size_tree = KernelInfoGetOptionalAttribute( + api, info, "batch_size_tree", static_cast(2)); + int64_t batch_size_rows = KernelInfoGetOptionalAttribute( + api, info, "batch_size_rows", static_cast(2)); + int64_t use_node3 = KernelInfoGetOptionalAttribute(api, info, "use_node3", + static_cast(0)); + + reg_float_float_float->set(parallel_tree, parallel_tree_N, parallel_N, + batch_size_tree, batch_size_rows, use_node3); +} + +//////////////////////// +// Kernel Implementation 
+//////////////////////// + +void TreeEnsembleKernel::Compute(OrtKernelContext *context) { + Ort::KernelContext ctx(context); + Ort::ConstValue input_X = ctx.GetInput(0); + std::vector dimensions_in = + input_X.GetTensorTypeAndShapeInfo().GetShape(); + EXT_ENFORCE(dimensions_in.size() == 2, "TreeEnsemble only allows 2D inputs."); + std::vector dimensions_out{dimensions_in[0], n_targets_or_classes}; + Ort::UnownedValue output = ctx.GetOutput(0, dimensions_out); + + if (reg_float_float_float.get() != nullptr) { + const float *X = input_X.GetTensorData(); + float *out = output.GetTensorMutableData(); + reg_float_float_float->Compute(dimensions_in[0], dimensions_in[1], X, out, + nullptr); + } else { + EXT_ENFORCE("No implementation yet for input type=", + input_X.GetTensorTypeAndShapeInfo().GetElementType(), + " and output type=", + output.GetTensorTypeAndShapeInfo().GetElementType(), "."); + } +} + +} // namespace ortops diff --git a/onnx_extended/ortops/optim/cpu/tree_ensemble.h b/onnx_extended/ortops/optim/cpu/tree_ensemble.h new file mode 100644 index 00000000..f04362de --- /dev/null +++ b/onnx_extended/ortops/optim/cpu/tree_ensemble.h @@ -0,0 +1,30 @@ +#pragma once + +#include "common/common_kernels.h" +#include "cpu/c_op_tree_ensemble_common_.hpp" +// #include + +namespace ortops { + +struct TreeEnsembleKernel { + TreeEnsembleKernel(const OrtApi &api, const OrtKernelInfo *info); + void Compute(OrtKernelContext *context); + + // Attributes + int64_t n_targets_or_classes; + std::unique_ptr> + reg_float_float_float; +}; + +struct TreeEnsembleRegressor + : Ort::CustomOpBase { + void *CreateKernel(const OrtApi &api, const OrtKernelInfo *info) const; + const char *GetName() const; + const char *GetExecutionProviderType() const; + size_t GetInputTypeCount() const; + ONNXTensorElementDataType GetInputType(size_t index) const; + size_t GetOutputTypeCount() const; + ONNXTensorElementDataType GetOutputType(size_t index) const; +}; + +} // namespace ortops diff --git 
a/onnx_extended/ortops/optim/optimize.py b/onnx_extended/ortops/optim/optimize.py new file mode 100644 index 00000000..7e5d8869 --- /dev/null +++ b/onnx_extended/ortops/optim/optimize.py @@ -0,0 +1,151 @@ +from typing import Any, Dict, Optional, Union +from onnx import AttributeProto, ModelProto, NodeProto, GraphProto, FunctionProto +from onnx.helper import make_model, make_node, make_graph, make_opsetid + + +def has_subgraph(node: NodeProto) -> bool: + """ + Tells if a node has a subgraph as an attribute. + """ + for att in node.attribute: + if att.type == AttributeProto.GRAPH: + return True + return False + + +def get_node_attribute(node: NodeProto, name: str) -> AttributeProto: + """ + Returns the value of one attribute. + + :param node: node + :param name: attribute name + :return: value + """ + for att in node.attribute: + if att.name == name: + return att + raise KeyError( + f"Unable to find {name!r} among {list(att.name for att in node.attribute)}." + ) + + +def change_onnx_operator_domain( + onx: Union[ModelProto, GraphProto, FunctionProto], + op_type: str, + op_domain: str = "", + new_op_type: Optional[str] = None, + new_op_domain: Optional[str] = None, + new_opset: Optional[int] = None, + **kwargs: Dict[str, Any], +) -> Union[ModelProto, GraphProto, FunctionProto]: + """ + Replaces an operator by another one in the same domain + or another one. + + :param onx: proto to modify + :param op_type: operator to look for + :param op_domain: domain to look for + :param new_op_type: new operator name or None for the same name + :param new_op_domain: new domain name or None for the same domain + :param new_opset: new opset for the new domain + :param kwargs: modified parameters, set it to None to remove them + :return: same type as the input + + The function is not recursive yet.
+ """ + + def change_node(node): + atts = [] + new_kwargs = {} + for att in node.attribute: + if att.name in kwargs: + v = kwargs[att.name] + if v is None: + continue + new_kwargs[att.name] = v + continue + atts.append(att) + for k, v in kwargs.items(): + if v is None or k in new_kwargs: + continue + new_kwargs[k] = v + new_node = make_node( + new_op_type or node.op_type, + node.input, + node.output, + domain=new_op_domain or node.domain, + **new_kwargs, + ) + if len(atts) > 0: + new_node.attribute.extend(atts) + return new_node + + if isinstance(onx, GraphProto): + new_nodes = [] + modified = False + for node in onx.node: + if has_subgraph(node): + raise NotImplementedError( + f"The function is not recursive yet and cannot " + f"handle node {node.op_type!r} from domain " + f"{node.domain!r}." + ) + if node.op_type == op_type and node.domain == op_domain: + new_node = change_node(node) + new_nodes.append(new_node) + modified = True + continue + new_nodes.append(node) + if not modified: + return onx + return make_graph( + new_nodes, + onx.name, + onx.input, + onx.output, + onx.initializer, + onx.sparse_initializer, + ) + + if isinstance(onx, FunctionProto): + raise NotImplementedError() + + if not isinstance(onx, ModelProto): + raise TypeError(f"Unexpected type for onx {type(onx)}.") + + new_graph = change_onnx_operator_domain( + onx.graph, + op_type=op_type, + op_domain=op_domain, + new_opset=new_opset, + new_op_type=new_op_type, + new_op_domain=new_op_domain, + **kwargs, + ) + if id(new_graph) == id(onx.graph): + # no change + return onx + + if new_op_domain is None: + new_op_domain = op_domain + if new_op_domain == op_domain and new_opset is not None: + raise ValueError( + f"If new_op_domain==domain=={new_op_domain!r}, " + f"new_opset must be None not {new_opset}." 
+ ) + opsets = list(onx.opset_import) + if new_op_domain != op_domain: + opsets.append(make_opsetid(new_op_domain, new_opset or 1)) + + new_model = make_model( + new_graph, + functions=onx.functions, + ir_version=onx.ir_version, + producer_name=onx.producer_name, + producer_version=onx.producer_version, + model_version=onx.model_version, + doc_string=onx.doc_string, + opset_imports=opsets, + domain=onx.domain, + ) + return new_model diff --git a/onnx_extended/ortops/tutorial/cpu/my_kernel.cc b/onnx_extended/ortops/tutorial/cpu/my_kernel.cc index 9832e46d..6ef7ef32 100644 --- a/onnx_extended/ortops/tutorial/cpu/my_kernel.cc +++ b/onnx_extended/ortops/tutorial/cpu/my_kernel.cc @@ -12,7 +12,8 @@ void MyCustomKernel::Compute(OrtKernelContext *context) { const float *Y = input_Y.GetTensorData(); // Setup output, which is assumed to have the same dimensions as the inputs. - std::vector dimensions = input_X.GetTensorTypeAndShapeInfo().GetShape(); + std::vector dimensions = + input_X.GetTensorTypeAndShapeInfo().GetShape(); Ort::UnownedValue output = ctx.GetOutput(0, dimensions); float *out = output.GetTensorMutableData(); @@ -25,20 +26,27 @@ void MyCustomKernel::Compute(OrtKernelContext *context) { } } -void* MyCustomOp::CreateKernel(const OrtApi& api, const OrtKernelInfo* info) const { +void *MyCustomOp::CreateKernel(const OrtApi &api, + const OrtKernelInfo *info) const { return std::make_unique(api, info).release(); }; -const char* MyCustomOp::GetName() const { return "MyCustomOp"; }; +const char *MyCustomOp::GetName() const { return "MyCustomOp"; }; -const char* MyCustomOp::GetExecutionProviderType() const { return "CPUExecutionProvider"; }; +const char *MyCustomOp::GetExecutionProviderType() const { + return "CPUExecutionProvider"; +}; size_t MyCustomOp::GetInputTypeCount() const { return 2; }; -ONNXTensorElementDataType MyCustomOp::GetInputType(size_t index) const { return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT; }; +ONNXTensorElementDataType 
MyCustomOp::GetInputType(size_t index) const { + return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT; +}; size_t MyCustomOp::GetOutputTypeCount() const { return 1; }; -ONNXTensorElementDataType MyCustomOp::GetOutputType(size_t index) const { return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT; }; +ONNXTensorElementDataType MyCustomOp::GetOutputType(size_t index) const { + return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT; +}; } // namespace ortops diff --git a/onnx_extended/ortops/tutorial/cpu/my_kernel.h b/onnx_extended/ortops/tutorial/cpu/my_kernel.h index fba3344d..67be0686 100644 --- a/onnx_extended/ortops/tutorial/cpu/my_kernel.h +++ b/onnx_extended/ortops/tutorial/cpu/my_kernel.h @@ -6,13 +6,13 @@ namespace ortops { struct MyCustomKernel { MyCustomKernel(const OrtApi &api, const OrtKernelInfo *info); - void Compute(OrtKernelContext* context); + void Compute(OrtKernelContext *context); }; struct MyCustomOp : Ort::CustomOpBase { - void* CreateKernel(const OrtApi& api, const OrtKernelInfo* info) const ; - const char* GetName() const; - const char* GetExecutionProviderType() const; + void *CreateKernel(const OrtApi &api, const OrtKernelInfo *info) const; + const char *GetName() const; + const char *GetExecutionProviderType() const; size_t GetInputTypeCount() const; ONNXTensorElementDataType GetInputType(size_t index) const; size_t GetOutputTypeCount() const; diff --git a/onnx_extended/ortops/tutorial/cpu/my_kernel_attr.cc b/onnx_extended/ortops/tutorial/cpu/my_kernel_attr.cc index a02fd3e8..5fcf911d 100644 --- a/onnx_extended/ortops/tutorial/cpu/my_kernel_attr.cc +++ b/onnx_extended/ortops/tutorial/cpu/my_kernel_attr.cc @@ -6,18 +6,21 @@ MyCustomKernelWithAttributes::MyCustomKernelWithAttributes( const OrtApi &api, const OrtKernelInfo *info) { // A float attribute. 
float value_float; - ThrowOnError(api, api.KernelInfoGetAttribute_float(info, "att_float", &value_float)); + ThrowOnError( + api, api.KernelInfoGetAttribute_float(info, "att_float", &value_float)); att_float = value_float; // An integer attribute. int64_t value_int64; - ThrowOnError(api, api.KernelInfoGetAttribute_int64(info, "att_int64", &value_int64)); + ThrowOnError( + api, api.KernelInfoGetAttribute_int64(info, "att_int64", &value_int64)); att_int64 = value_int64; // A string attribute. char value_string[1000]; size_t size = 1000; - ThrowOnError(api, api.KernelInfoGetAttribute_string(info, "att_string", value_string, &size)); + ThrowOnError(api, api.KernelInfoGetAttribute_string(info, "att_string", + value_string, &size)); att_string = value_string; // A tensor attribute @@ -25,22 +28,24 @@ MyCustomKernelWithAttributes::MyCustomKernelWithAttributes( OrtAllocator *cpu_allocator; ThrowOnError(api, api.GetAllocatorWithDefaultOptions(&cpu_allocator)); - OrtValue* value_tensor = nullptr; - ThrowOnError(api, api.KernelInfoGetAttribute_tensor(info, "att_tensor", cpu_allocator, &value_tensor)); + OrtValue *value_tensor = nullptr; + ThrowOnError(api, api.KernelInfoGetAttribute_tensor( + info, "att_tensor", cpu_allocator, &value_tensor)); // Retrieve the dimensions and the element type. - OrtTensorTypeAndShapeInfo* shape_info; + OrtTensorTypeAndShapeInfo *shape_info; ThrowOnError(api, api.GetTensorTypeAndShape(value_tensor, &shape_info)); // Retrieve the element type. ONNXTensorElementDataType elem_type; ThrowOnError(api, api.GetTensorElementType(shape_info, &elem_type)); - if (elem_type != ONNXTensorElementDataType::ONNX_TENSOR_ELEMENT_DATA_TYPE_DOUBLE) { + if (elem_type != + ONNXTensorElementDataType::ONNX_TENSOR_ELEMENT_DATA_TYPE_DOUBLE) { api.ReleaseTensorTypeAndShapeInfo(shape_info); api.ReleaseValue(value_tensor); throw std::runtime_error( - "Attribute 'att_tensor' of operator 'MyCustomOpWithAttributes' expects a double tensor." 
- ); + "Attribute 'att_tensor' of operator 'MyCustomOpWithAttributes' expects " + "a double tensor."); } // Retrieve the number of elements in the shape. @@ -52,7 +57,7 @@ MyCustomKernelWithAttributes::MyCustomKernelWithAttributes( size_t size_tensor; ThrowOnError(api, api.GetTensorShapeElementCount(shape_info, &size_tensor)); att_tensor_double.resize(size_tensor); - void* data; + void *data; ThrowOnError(api, api.GetTensorMutableData(value_tensor, &data)); memcpy(att_tensor_double.data(), data, size_tensor * sizeof(double)); @@ -63,9 +68,8 @@ MyCustomKernelWithAttributes::MyCustomKernelWithAttributes( // Verifications. if (att_tensor_double.empty()) { - throw std::runtime_error( - "Attribute 'att_tensor' of operator 'MyCustomOpWithAttributes' cannot be empty." - ); + throw std::runtime_error("Attribute 'att_tensor' of operator " + "'MyCustomOpWithAttributes' cannot be empty."); } } @@ -77,7 +81,8 @@ void MyCustomKernelWithAttributes::Compute(OrtKernelContext *context) { const double *Y = input_Y.GetTensorData(); // Setup output, which is assumed to have the same dimensions as the inputs. 
- std::vector dimensions = input_X.GetTensorTypeAndShapeInfo().GetShape(); + std::vector dimensions = + input_X.GetTensorTypeAndShapeInfo().GetShape(); Ort::UnownedValue output = ctx.GetOutput(0, dimensions); double *out = output.GetTensorMutableData(); @@ -85,27 +90,40 @@ void MyCustomKernelWithAttributes::Compute(OrtKernelContext *context) { const size_t size = output.GetTensorTypeAndShapeInfo().GetElementCount(); // Do computation - double cst = att_tensor_double[0] + static_cast(att_float) + static_cast(att_int64) + static_cast(att_string[0]); + double cst = att_tensor_double[0] + static_cast(att_float) + + static_cast(att_int64) + + static_cast(att_string[0]); for (size_t i = 0; i < size; i++) { out[i] = X[i] + Y[i] + cst; } } -void* MyCustomOpWithAttributes::CreateKernel(const OrtApi& api, const OrtKernelInfo* info) const { +void *MyCustomOpWithAttributes::CreateKernel(const OrtApi &api, + const OrtKernelInfo *info) const { return std::make_unique(api, info).release(); }; -const char* MyCustomOpWithAttributes::GetName() const { return "MyCustomOpWithAttributes"; }; +const char *MyCustomOpWithAttributes::GetName() const { + return "MyCustomOpWithAttributes"; +}; -const char* MyCustomOpWithAttributes::GetExecutionProviderType() const { return "CPUExecutionProvider"; }; +const char *MyCustomOpWithAttributes::GetExecutionProviderType() const { + return "CPUExecutionProvider"; +}; size_t MyCustomOpWithAttributes::GetInputTypeCount() const { return 2; }; -ONNXTensorElementDataType MyCustomOpWithAttributes::GetInputType(size_t index) const { return ONNX_TENSOR_ELEMENT_DATA_TYPE_DOUBLE; }; +ONNXTensorElementDataType +MyCustomOpWithAttributes::GetInputType(size_t index) const { + return ONNX_TENSOR_ELEMENT_DATA_TYPE_DOUBLE; +}; size_t MyCustomOpWithAttributes::GetOutputTypeCount() const { return 1; }; -ONNXTensorElementDataType MyCustomOpWithAttributes::GetOutputType(size_t index) const { return ONNX_TENSOR_ELEMENT_DATA_TYPE_DOUBLE; }; +ONNXTensorElementDataType 
+MyCustomOpWithAttributes::GetOutputType(size_t index) const { + return ONNX_TENSOR_ELEMENT_DATA_TYPE_DOUBLE; +}; } // namespace ortops diff --git a/onnx_extended/ortops/tutorial/cpu/my_kernel_attr.h b/onnx_extended/ortops/tutorial/cpu/my_kernel_attr.h index d8817f19..7d9d5082 100644 --- a/onnx_extended/ortops/tutorial/cpu/my_kernel_attr.h +++ b/onnx_extended/ortops/tutorial/cpu/my_kernel_attr.h @@ -5,20 +5,22 @@ namespace ortops { struct MyCustomKernelWithAttributes { - MyCustomKernelWithAttributes(const OrtApi& api, const OrtKernelInfo* info); - void Compute(OrtKernelContext* context); + MyCustomKernelWithAttributes(const OrtApi &api, const OrtKernelInfo *info); + void Compute(OrtKernelContext *context); - private: - std::string att_string; - float att_float; - int64_t att_int64; - std::vector att_tensor_double; +private: + std::string att_string; + float att_float; + int64_t att_int64; + std::vector att_tensor_double; }; -struct MyCustomOpWithAttributes : Ort::CustomOpBase { - void* CreateKernel(const OrtApi& api, const OrtKernelInfo* info) const ; - const char* GetName() const; - const char* GetExecutionProviderType() const; +struct MyCustomOpWithAttributes + : Ort::CustomOpBase { + void *CreateKernel(const OrtApi &api, const OrtKernelInfo *info) const; + const char *GetName() const; + const char *GetExecutionProviderType() const; size_t GetInputTypeCount() const; ONNXTensorElementDataType GetInputType(size_t index) const; size_t GetOutputTypeCount() const; diff --git a/onnx_extended/ortops/tutorial/cpu/ort_tutorial_cpu_lib.cc b/onnx_extended/ortops/tutorial/cpu/ort_tutorial_cpu_lib.cc index 3609c0d1..74e2bcce 100644 --- a/onnx_extended/ortops/tutorial/cpu/ort_tutorial_cpu_lib.cc +++ b/onnx_extended/ortops/tutorial/cpu/ort_tutorial_cpu_lib.cc @@ -4,20 +4,21 @@ #include #include -#include "ort_tutorial_cpu_lib.h" #include "my_kernel.h" #include "my_kernel_attr.h" +#include "ort_tutorial_cpu_lib.h" -static const char* c_OpDomain = 
"onnx_extented.ortops.tutorial.cpu"; +static const char *c_OpDomain = "onnx_extented.ortops.tutorial.cpu"; -static void AddOrtCustomOpDomainToContainer(Ort::CustomOpDomain&& domain) { +static void AddOrtCustomOpDomainToContainer(Ort::CustomOpDomain &&domain) { static std::vector ort_custom_op_domain_container; static std::mutex ort_custom_op_domain_mutex; std::lock_guard lock(ort_custom_op_domain_mutex); ort_custom_op_domain_container.push_back(std::move(domain)); } -OrtStatus* ORT_API_CALL RegisterCustomOps(OrtSessionOptions* options, const OrtApiBase* api_base) { +OrtStatus *ORT_API_CALL RegisterCustomOps(OrtSessionOptions *options, + const OrtApiBase *api_base) { Ort::InitApi(api_base->GetApi(ORT_API_VERSION)); Ort::UnownedSessionOptions session_options(options); @@ -25,7 +26,7 @@ OrtStatus* ORT_API_CALL RegisterCustomOps(OrtSessionOptions* options, const OrtA static ortops::MyCustomOp c_CustomOp; static ortops::MyCustomOpWithAttributes c_CustomOpAttr; - OrtStatus* result = nullptr; + OrtStatus *result = nullptr; try { Ort::CustomOpDomain domain{c_OpDomain}; @@ -35,8 +36,7 @@ OrtStatus* ORT_API_CALL RegisterCustomOps(OrtSessionOptions* options, const OrtA session_options.Add(domain); AddOrtCustomOpDomainToContainer(std::move(domain)); - } - catch (const std::exception& e) { + } catch (const std::exception &e) { Ort::Status status{e}; result = status.release(); } diff --git a/onnx_extended/ortops/tutorial/cpu/ort_tutorial_cpu_lib.h b/onnx_extended/ortops/tutorial/cpu/ort_tutorial_cpu_lib.h index a69d2d7d..0e7867fa 100644 --- a/onnx_extended/ortops/tutorial/cpu/ort_tutorial_cpu_lib.h +++ b/onnx_extended/ortops/tutorial/cpu/ort_tutorial_cpu_lib.h @@ -8,7 +8,8 @@ extern "C" { #endif -ORT_EXPORT OrtStatus* ORT_API_CALL RegisterCustomOps(OrtSessionOptions* options, const OrtApiBase* api_base); +ORT_EXPORT OrtStatus *ORT_API_CALL +RegisterCustomOps(OrtSessionOptions *options, const OrtApiBase *api_base); #ifdef __cplusplus } diff --git 
a/onnx_extended/ortops/tutorial/cuda/custom_gemm.cu b/onnx_extended/ortops/tutorial/cuda/custom_gemm.cu index a791dae4..5f285204 100644 --- a/onnx_extended/ortops/tutorial/cuda/custom_gemm.cu +++ b/onnx_extended/ortops/tutorial/cuda/custom_gemm.cu @@ -215,8 +215,8 @@ CustomGemmKernel::CustomGemmKernel(const OrtApi &api, EXT_THROW("Unexpected value for compute_type '", compute_type, "'."); } - std::string activation = KernelInfoGetOptionalAttributeString( - api, info, "activation", "DEFUALT"); + std::string activation = + KernelInfoGetOptionalAttributeString(api, info, "activation", "DEFUALT"); if (activation == "DEFUALT") { epilogue_ = EpiloqueGemmKernel::Default; } else if (activation == "RELU") { @@ -319,7 +319,8 @@ void CustomGemmKernel::Compute(OrtKernelContext *context) { bool has_scales = n_inputs > 3; if (has_scales) { - EXT_ENFORCE(n_inputs == 6, "Number of inputs must be 6 but is ", n_inputs, "."); + EXT_ENFORCE(n_inputs == 6, "Number of inputs must be 6 but is ", n_inputs, + "."); scale_A = ctx.GetInput(3); scale_B = ctx.GetInput(4); scale_Y = ctx.GetInput(5); @@ -516,17 +517,17 @@ void CustomGemmKernel::ComputeGemm( } cublasLtEpilogue_t epilogue; - switch(epilogue_) { - case EpiloqueGemmKernel::Default: + switch (epilogue_) { + case EpiloqueGemmKernel::Default: epilogue = CUBLASLT_EPILOGUE_DEFAULT; break; - case EpiloqueGemmKernel::Relu: + case EpiloqueGemmKernel::Relu: epilogue = CUBLASLT_EPILOGUE_RELU; break; - case EpiloqueGemmKernel::Gelu: + case EpiloqueGemmKernel::Gelu: epilogue = CUBLASLT_EPILOGUE_GELU; break; - } + } cublasLtMatmulDescSetAttribute(operationDesc, CUBLASLT_MATMUL_DESC_EPILOGUE, &epilogue, sizeof(epilogue)); diff --git a/onnx_extended/ortops/tutorial/cuda/ort_tutorial_cuda_lib.cc b/onnx_extended/ortops/tutorial/cuda/ort_tutorial_cuda_lib.cc index 5cad8ae6..28afd8db 100644 --- a/onnx_extended/ortops/tutorial/cuda/ort_tutorial_cuda_lib.cc +++ b/onnx_extended/ortops/tutorial/cuda/ort_tutorial_cuda_lib.cc @@ -4,44 +4,44 @@ #include 
#include -#include "ort_tutorial_cuda_lib.h" #include "custom_gemm.h" +#include "ort_tutorial_cuda_lib.h" -static const char* c_OpDomain = "onnx_extented.ortops.tutorial.cuda"; +static const char *c_OpDomain = "onnx_extented.ortops.tutorial.cuda"; -static void AddOrtCustomOpDomainToContainer(Ort::CustomOpDomain&& domain) { +static void AddOrtCustomOpDomainToContainer(Ort::CustomOpDomain &&domain) { static std::vector ort_custom_op_domain_container; static std::mutex ort_custom_op_domain_mutex; std::lock_guard lock(ort_custom_op_domain_mutex); ort_custom_op_domain_container.push_back(std::move(domain)); } -OrtStatus* ORT_API_CALL RegisterCustomOps(OrtSessionOptions* options, const OrtApiBase* api_base) { +OrtStatus *ORT_API_CALL RegisterCustomOps(OrtSessionOptions *options, + const OrtApiBase *api_base) { Ort::InitApi(api_base->GetApi(ORT_API_VERSION)); Ort::UnownedSessionOptions session_options(options); // An instance remaining available until onnxruntime unload the library. static ortops::CustomGemmOpFloat c_CustomGemmFloat; static ortops::CustomGemmOpFloat16 c_CustomGemmFloat16; - #if ORT_VERSION >= 1160 && CUDA_VERSION >= 11080 +#if ORT_VERSION >= 1160 && CUDA_VERSION >= 11080 static ortops::CustomGemmOpFloat8E4M3FN c_CustomGemmFloat8E4M3FN; - #endif +#endif - OrtStatus* result = nullptr; + OrtStatus *result = nullptr; try { Ort::CustomOpDomain domain{c_OpDomain}; domain.Add(&c_CustomGemmFloat); domain.Add(&c_CustomGemmFloat16); - #if ORT_VERSION >= 1160 && CUDA_VERSION >= 11080 +#if ORT_VERSION >= 1160 && CUDA_VERSION >= 11080 domain.Add(&c_CustomGemmFloat8E4M3FN); - #endif +#endif session_options.Add(domain); AddOrtCustomOpDomainToContainer(std::move(domain)); - } - catch (const std::exception& e) { + } catch (const std::exception &e) { Ort::Status status{e}; result = status.release(); } diff --git a/onnx_extended/ortops/tutorial/cuda/ort_tutorial_cuda_lib.h b/onnx_extended/ortops/tutorial/cuda/ort_tutorial_cuda_lib.h index a69d2d7d..0e7867fa 100644 --- 
a/onnx_extended/ortops/tutorial/cuda/ort_tutorial_cuda_lib.h +++ b/onnx_extended/ortops/tutorial/cuda/ort_tutorial_cuda_lib.h @@ -8,7 +8,8 @@ extern "C" { #endif -ORT_EXPORT OrtStatus* ORT_API_CALL RegisterCustomOps(OrtSessionOptions* options, const OrtApiBase* api_base); +ORT_EXPORT OrtStatus *ORT_API_CALL +RegisterCustomOps(OrtSessionOptions *options, const OrtApiBase *api_base); #ifdef __cplusplus } diff --git a/onnx_extended/reference/c_custom_ops/__init__.py b/onnx_extended/reference/c_custom_ops/__init__.py new file mode 100644 index 00000000..8b137891 --- /dev/null +++ b/onnx_extended/reference/c_custom_ops/__init__.py @@ -0,0 +1 @@ + diff --git a/onnx_extended/reference/c_custom_ops/custom_op_tree_ensemble_regressor.py b/onnx_extended/reference/c_custom_ops/custom_op_tree_ensemble_regressor.py new file mode 100644 index 00000000..45cf49b6 --- /dev/null +++ b/onnx_extended/reference/c_custom_ops/custom_op_tree_ensemble_regressor.py @@ -0,0 +1,190 @@ +from typing import Any, Dict +import numpy +from onnx import NodeProto +from onnx.defs import OpSchema, get_schema +from onnx.reference.op_run import OpRun +from ..c_ops.cpu.c_op_tree_ensemble_py_ import ( + RuntimeTreeEnsembleRegressorFloat, + RuntimeTreeEnsembleRegressorDouble, +) + + +class TreeEnsembleRegressorCommon(OpRun): + op_domain = "onnx_extented.ortops.optim.cpu" + + def __init__( + self, onnx_node: NodeProto, run_params: Dict[str, Any], schema: Any = None + ): + OpRun.__init__(self, onnx_node, run_params, schema=schema) + self.parallel = None + self.rt_ = None + # default is no parallelization + self.set_parallel(int(100e6), int(100e6), int(100e6), 1, 1, 0) + + def set_parallel( + self, + parallel_tree: int = -1, + parallel_tree_N: int = -1, + parallel_N: int = -1, + batch_size_tree: int = -1, + batch_size_rows: int = -1, + node3: int = -1, + ): + """ + Sets the parameter for parallelization. + If a parameter is set to -1, its value does not change. 
+ + :param parallel_tree: parallization by trees if the number of trees is higher + :param parallel_tree_N: batch size (rows) if parallization by trees + :param parallel_N: parallization by rows if the number of rows is higher + :param batch_size_tree: number of trees to compute at the same time + :param batch_size_rows: number of rows to compute at the same time + :param node3: use bigger nodes + """ + self.parallel = ( + parallel_tree, + parallel_tree_N, + parallel_N, + batch_size_tree, + batch_size_rows, + node3, + ) + if self.rt_ is not None: + self.rt_.set(*self.parallel) + + def _init(self, dtype, **kwargs): + if dtype == numpy.float32: + cls = RuntimeTreeEnsembleRegressorFloat + else: + cls = RuntimeTreeEnsembleRegressorDouble + + self.rt_ = cls() + + empty_f = numpy.array([], dtype=dtype) + base_values = numpy.array( + kwargs.get("base_values", None) + or kwargs.get("base_values_as_tensor", None) + or empty_f + ) + nodes_values = numpy.array( + kwargs.get("nodes_values", None) + or kwargs.get("nodes_values_as_tensor", None) + or empty_f + ) + nodes_hitrates = numpy.array( + kwargs.get("nodes_hitrates", None) + or kwargs.get("nodes_hitrates_as_tensor", None) + or empty_f + ) + tw = numpy.array( + kwargs.get("target_weights", None) + or kwargs.get("target_weights", None) + or empty_f + ) + + self.rt_.init( + kwargs.get("aggregate_function", "SUM"), # 3 + base_values, # 4 + kwargs["n_targets"], # 5 + kwargs["nodes_falsenodeids"], # 6 + kwargs["nodes_featureids"], # 7 + nodes_hitrates, # 8 + kwargs.get("nodes_missing_value_tracks_true", []), # 9 + kwargs["nodes_modes"].split(","), # 10 + kwargs["nodes_nodeids"], # 11 + kwargs["nodes_treeids"], # 12 + kwargs["nodes_truenodeids"], # 13 + nodes_values, # 14 + kwargs["post_transform"], # 15 + kwargs["target_ids"], # 16 + kwargs["target_nodeids"], # 17 + kwargs["target_treeids"], # 18 + tw, # 19 + ) + if self.parallel is not None: + self.rt_.set(*self.parallel) + + def _run(self, x, **kwargs): + if hasattr(x, 
"todense"): + x = x.todense() + if self.rt_ is None: + self._init(x.dtype, **kwargs) + pred = self.rt_.compute(x) + if pred.shape[0] != x.shape[0]: + pred = pred.reshape((x.shape[0], -1)) + return (pred,) + + +def _make_schema(): + attributes = [] + sch = get_schema("TreeEnsembleRegressor", 1, "ai.onnx.ml") + for att in sch.attributes.values(): + if att.name == "nodes_modes": + attributes.append( + OpSchema.Attribute( + "nodes_modes", + OpSchema.AttrType.STRING, + "comma separated value nodes_modes", + ) + ) + else: + attributes.append(att) + return OpSchema( + "TreeEnsembleRegressor", + TreeEnsembleRegressorCommon.op_domain, + 1, + inputs=[ + OpSchema.FormalParameter("X", "T"), + ], + outputs=[ + OpSchema.FormalParameter("Y", "T"), + ], + type_constraints=[("T", ["tensor(float)"], "")], + attributes=attributes, + ) + + +class TreeEnsembleRegressor_1(TreeEnsembleRegressorCommon): + op_schema = _make_schema() + + def _run( + self, + x, + aggregate_function=None, + base_values=None, + n_targets=None, + nodes_falsenodeids=None, + nodes_featureids=None, + nodes_hitrates=None, + nodes_missing_value_tracks_true=None, + nodes_modes=None, + nodes_nodeids=None, + nodes_treeids=None, + nodes_truenodeids=None, + nodes_values=None, + post_transform=None, + target_ids=None, + target_nodeids=None, + target_treeids=None, + target_weights=None, + ): + return TreeEnsembleRegressorCommon._run( + self, + x, + base_values=base_values, + n_targets=n_targets, + nodes_falsenodeids=nodes_falsenodeids, + nodes_featureids=nodes_featureids, + nodes_hitrates=nodes_hitrates, + nodes_missing_value_tracks_true=nodes_missing_value_tracks_true, + nodes_modes=nodes_modes, + nodes_nodeids=nodes_nodeids, + nodes_treeids=nodes_treeids, + nodes_truenodeids=nodes_truenodeids, + nodes_values=nodes_values, + post_transform=post_transform, + target_ids=target_ids, + target_nodeids=target_nodeids, + target_treeids=target_treeids, + target_weights=target_weights, + ) diff --git 
a/onnx_extended/reference/c_ops/c_op_common.cpp b/onnx_extended/reference/c_ops/c_op_common.cpp new file mode 100755 index 00000000..19ade3aa Binary files /dev/null and b/onnx_extended/reference/c_ops/c_op_common.cpp differ diff --git a/onnx_extended/reference/c_ops/cpu/c_op_common.h b/onnx_extended/reference/c_ops/cpu/c_op_common.h index a3e6a190..1097cd84 100644 --- a/onnx_extended/reference/c_ops/cpu/c_op_common.h +++ b/onnx_extended/reference/c_ops/cpu/c_op_common.h @@ -13,6 +13,8 @@ #include #endif +#include "onnx_extended_helpers.h" + namespace onnx_c_ops { void *AllocatorDefaultAlloc(size_t size); @@ -440,79 +442,4 @@ void debug_print(const std::string &msg, size_t i, size_t j, size_t k, float pa, void debug_print(const std::string &msg, size_t i, size_t j, size_t k, double pa, double pb, double val); -inline void MakeStringInternal(std::ostringstream &ss) noexcept {} - -template -inline void MakeStringInternal(std::ostringstream &ss, const T &t) noexcept { - ss << t; -} - -template <> -inline void MakeStringInternal(std::ostringstream &ss, - const std::vector &t) noexcept { - for (auto it : t) - ss << "x" << it; -} - -template <> -inline void MakeStringInternal(std::ostringstream &ss, - const std::vector &t) noexcept { - for (auto it : t) - ss << "x" << it; -} - -template <> -inline void MakeStringInternal(std::ostringstream &ss, - const std::vector &t) noexcept { - for (auto it : t) - ss << "x" << it; -} - -template <> -inline void MakeStringInternal(std::ostringstream &ss, - const std::vector &t) noexcept { - for (auto it : t) - ss << "x" << it; -} - -template <> -inline void MakeStringInternal(std::ostringstream &ss, - const std::vector &t) noexcept { - for (auto it : t) - ss << "x" << it; -} - -template <> -inline void MakeStringInternal(std::ostringstream &ss, - const std::vector &t) noexcept { - for (auto it : t) - ss << "x" << it; -} - -template -inline void MakeStringInternal(std::ostringstream &ss, const T &t, - const Args &...args) noexcept { - 
MakeStringInternal(ss, t); - MakeStringInternal(ss, args...); -} - -template inline std::string MakeString(const Args &...args) { - std::ostringstream ss; - MakeStringInternal(ss, args...); - return std::string(ss.str()); -} - -#if !defined(_THROW_DEFINED) -#define EXT_THROW(...) throw std::runtime_error(MakeString(__VA_ARGS__)); -#define _THROW_DEFINED -#endif - -#if !defined(_ENFORCE_DEFINED) -#define EXT_ENFORCE(cond, ...) \ - if (!(cond)) \ - throw std::runtime_error( \ - MakeString("`", #cond, "` failed. ", MakeString(__VA_ARGS__))); -#define _ENFORCE_DEFINED -#endif - } // namespace onnx_c_ops diff --git a/onnx_extended/reference/c_ops/cpu/c_op_common_parallel.hpp b/onnx_extended/reference/c_ops/cpu/c_op_common_parallel.hpp index 21461e91..71dc927b 100644 --- a/onnx_extended/reference/c_ops/cpu/c_op_common_parallel.hpp +++ b/onnx_extended/reference/c_ops/cpu/c_op_common_parallel.hpp @@ -58,7 +58,7 @@ inline void TrySimpleParallelFor(int64_t n_threads, int64_t batch_size, template inline void TryBatchParallelFor(int64_t n_threads, int64_t batch_size, - int64_t total, F &&fn) { + int64_t total, F &&fn) { if (n_threads != omp_get_max_threads()) { throw std::runtime_error("TryBatchParallelFor not implemented when " "n_threads != omp_get_max_threads()."); diff --git a/onnx_extended/reference/c_ops/cpu/c_op_conv.h b/onnx_extended/reference/c_ops/cpu/c_op_conv.h index 0a355458..193e18d7 100644 --- a/onnx_extended/reference/c_ops/cpu/c_op_conv.h +++ b/onnx_extended/reference/c_ops/cpu/c_op_conv.h @@ -1,5 +1,7 @@ #include "c_op_conv_common.h" +using namespace onnx_extended_helpers; + namespace onnx_c_ops { template diff --git a/onnx_extended/reference/c_ops/cpu/c_op_conv_common.h b/onnx_extended/reference/c_ops/cpu/c_op_conv_common.h index ddb5c3b2..09d1d098 100644 --- a/onnx_extended/reference/c_ops/cpu/c_op_conv_common.h +++ b/onnx_extended/reference/c_ops/cpu/c_op_conv_common.h @@ -11,110 +11,104 @@ namespace onnx_c_ops { // was initialized. 
template void gemm(bool transA, bool transB, size_t M, size_t N, size_t K, NTYPE alpha, - const NTYPE* A, const NTYPE* B, NTYPE beta, NTYPE* C) { + const NTYPE *A, const NTYPE *B, NTYPE beta, NTYPE *C) { #if defined(__APPLE__) - // link issues on apple, "___kmpc_fork_call", referenced from: - if (transA) { - if (transB) { - } - else { - // a A B + b C, dimension = M * N - NTYPE* begin; - NTYPE val; - NTYPE val0; - size_t i, j, k, maxc = 0; - const NTYPE* pA, * pB; - for (i = 0, begin = C; i < M; ++i) { - for (j = 0; j < N; ++j, ++begin) { - val0 = *begin * beta; - val = 0; - pA = A + i; - pB = B + j; - for (k = K; k > 0; --k, pA += K, pB += N) - val += *pA * *pB; - *begin = val0 + val * alpha; - maxc = maxc > (size_t)(begin - C) ? maxc : (size_t)(begin - C); - if (maxc > M * N) - throw std::invalid_argument("gemm10: maxc > M * N"); - } - } - return; + // link issues on apple, "___kmpc_fork_call", referenced from: + if (transA) { + if (transB) { + } else { + // a A B + b C, dimension = M * N + NTYPE *begin; + NTYPE val; + NTYPE val0; + size_t i, j, k, maxc = 0; + const NTYPE *pA, *pB; + for (i = 0, begin = C; i < M; ++i) { + for (j = 0; j < N; ++j, ++begin) { + val0 = *begin * beta; + val = 0; + pA = A + i; + pB = B + j; + for (k = K; k > 0; --k, pA += K, pB += N) + val += *pA * *pB; + *begin = val0 + val * alpha; + maxc = maxc > (size_t)(begin - C) ? maxc : (size_t)(begin - C); + if (maxc > M * N) + throw std::invalid_argument("gemm10: maxc > M * N"); } + } + return; } - else { - if (transB) { - } - else { - // a A B + b C, dimension = M * N - NTYPE* begin; - NTYPE val; - NTYPE val0; - size_t i, j, k, maxc = 0; - const NTYPE* pA, * pB; - for (i = 0, begin = C; i < M; ++i) { - for (j = 0; j < N; ++j, ++begin) { - val0 = *begin * beta; - val = 0; - pA = A + i * K; - pB = B + j; - for (k = K; k > 0; --k, ++pA, pB += N) - val += *pA * *pB; - *begin = val0 + val * alpha; - maxc = maxc > (size_t)(begin - C) ? 
maxc : (size_t)(begin - C); - if (maxc > M * N) - throw std::invalid_argument("gemm00: maxc > M * N"); - } - } - return; + } else { + if (transB) { + } else { + // a A B + b C, dimension = M * N + NTYPE *begin; + NTYPE val; + NTYPE val0; + size_t i, j, k, maxc = 0; + const NTYPE *pA, *pB; + for (i = 0, begin = C; i < M; ++i) { + for (j = 0; j < N; ++j, ++begin) { + val0 = *begin * beta; + val = 0; + pA = A + i * K; + pB = B + j; + for (k = K; k > 0; --k, ++pA, pB += N) + val += *pA * *pB; + *begin = val0 + val * alpha; + maxc = maxc > (size_t)(begin - C) ? maxc : (size_t)(begin - C); + if (maxc > M * N) + throw std::invalid_argument("gemm00: maxc > M * N"); } + } + return; } + } #else - typedef Map> matrixdd_row; - typedef Map> matrixdd_col; - matrixdd_row mc(C, M, N); - if (beta != 1) - mc *= beta; - if (transA) { - matrixdd_col ma((NTYPE*)A, M, K); - if (transB) { - matrixdd_col mb((NTYPE*)B, K, N); - if (alpha != 1) - mc.noalias() += alpha * ma * mb; - else - mc.noalias() += ma * mb; - return; - } - else { - matrixdd_row mb((NTYPE*)B, K, N); - if (alpha != 1) - mc.noalias() += alpha * ma * mb; - else - mc.noalias() += ma * mb; - return; - } + typedef Map> matrixdd_row; + typedef Map> matrixdd_col; + matrixdd_row mc(C, M, N); + if (beta != 1) + mc *= beta; + if (transA) { + matrixdd_col ma((NTYPE *)A, M, K); + if (transB) { + matrixdd_col mb((NTYPE *)B, K, N); + if (alpha != 1) + mc.noalias() += alpha * ma * mb; + else + mc.noalias() += ma * mb; + return; + } else { + matrixdd_row mb((NTYPE *)B, K, N); + if (alpha != 1) + mc.noalias() += alpha * ma * mb; + else + mc.noalias() += ma * mb; + return; } - else { - matrixdd_row ma((NTYPE*)A, M, K); - if (transB) { - matrixdd_col mb((NTYPE*)B, K, N); - if (alpha != 1) - mc.noalias() += alpha * ma * mb; - else - mc.noalias() += ma * mb; - return; - } - else { - matrixdd_row mb((NTYPE*)B, K, N); - if (alpha != 1) - mc.noalias() += alpha * ma * mb; - else - mc.noalias() += ma * mb; - return; - } + } else { + matrixdd_row 
ma((NTYPE *)A, M, K); + if (transB) { + matrixdd_col mb((NTYPE *)B, K, N); + if (alpha != 1) + mc.noalias() += alpha * ma * mb; + else + mc.noalias() += ma * mb; + return; + } else { + matrixdd_row mb((NTYPE *)B, K, N); + if (alpha != 1) + mc.noalias() += alpha * ma * mb; + else + mc.noalias() += ma * mb; + return; } + } #endif - throw std::invalid_argument( - "Not implemented for adjointd matrices (Gemm)."); + throw std::invalid_argument( + "Not implemented for adjointd matrices (Gemm)."); } }; // namespace onnx_c_ops diff --git a/onnx_extended/reference/c_ops/cpu/c_op_tree_ensemble_common_.hpp b/onnx_extended/reference/c_ops/cpu/c_op_tree_ensemble_common_.hpp index 218bb57c..7a29cbe6 100644 --- a/onnx_extended/reference/c_ops/cpu/c_op_tree_ensemble_common_.hpp +++ b/onnx_extended/reference/c_ops/cpu/c_op_tree_ensemble_common_.hpp @@ -6,13 +6,18 @@ #include "c_op_common_parallel.hpp" #include "c_op_tree_ensemble_common_agg_.hpp" +#include +#include -// #define DEBUG_PRINT(...) printf("%s", MakeString("*", __FILE__, ":", __LINE__, ":", MakeString(__VA_ARGS__), "\n").c_str()); +// #define DEBUG_PRINT(...) printf("%s", MakeString("*", __FILE__, ":", +// __LINE__, ":", MakeString(__VA_ARGS__), "\n").c_str()); #define DEBUG_PRINT(...) 
// https://cims.nyu.edu/~stadler/hpc17/material/ompLec.pdf // http://amestoy.perso.enseeiht.fr/COURS/CoursMulticoreProgrammingButtari.pdf +using namespace onnx_extended_helpers; + namespace onnx_c_ops { template struct TreeAlloc { @@ -185,7 +190,9 @@ Status TreeEnsembleCommon::Init( DEBUG_PRINT("Init") EXT_ENFORCE(n_targets_or_classes > 0); EXT_ENFORCE(nodes_falsenodeids.size() == nodes_featureids.size()); - EXT_ENFORCE(nodes_falsenodeids.size() == nodes_modes.size()); + EXT_ENFORCE(nodes_falsenodeids.size() == nodes_modes.size(), + "nodes_falsenodeids.size()=", nodes_falsenodeids.size(), + " nodes_modes.size()=", nodes_modes.size()); EXT_ENFORCE(nodes_falsenodeids.size() == nodes_nodeids.size()); EXT_ENFORCE(nodes_falsenodeids.size() == nodes_treeids.size()); EXT_ENFORCE(nodes_falsenodeids.size() == nodes_truenodeids.size()); @@ -262,8 +269,8 @@ Status TreeEnsembleCommon::Init( auto p = idi.insert(std::pair(node_tree_id, i)); if (!p.second) { - EXT_THROW("Node ", node_tree_id.node_id, " in tree ", node_tree_id.tree_id, - " is already there."); + EXT_THROW("Node ", node_tree_id.node_id, " in tree ", + node_tree_id.tree_id, " is already there."); } nodes_.emplace_back(node); node_tree_ids.emplace_back(node_tree_id); @@ -289,11 +296,11 @@ Status TreeEnsembleCommon::Init( auto found = idi.find(coor); if (found == idi.end()) { EXT_THROW("Unable to find node ", coor.tree_id, "-", coor.node_id, - " (truenode)."); + " (truenode)."); } if (found->second == truenode_ids.size()) { EXT_THROW("A node cannot point to itself: ", coor.tree_id, "-", - node_tree_id.node_id, " (truenode)."); + node_tree_id.node_id, " (truenode)."); } truenode_ids.emplace_back(found->second); @@ -302,11 +309,11 @@ Status TreeEnsembleCommon::Init( found = idi.find(coor); if (found == idi.end()) { EXT_THROW("Unable to find node ", coor.tree_id, "-", coor.node_id, - " (falsenode)."); + " (falsenode)."); } if (found->second == falsenode_ids.size()) { EXT_THROW("A node cannot point to itself: ", 
coor.tree_id, "-", - node_tree_id.node_id, " (falsenode)."); + node_tree_id.node_id, " (falsenode)."); } falsenode_ids.emplace_back(found->second); // We could also check that truenode_ids[truenode_ids.size() - 1] != @@ -334,7 +341,7 @@ Status TreeEnsembleCommon::Init( auto found = idi.find(ind); if (found == idi.end()) { EXT_THROW("Unable to find node ", ind.tree_id, "-", ind.node_id, - " (weights)."); + " (weights)."); } TreeNodeElement &leaf = nodes_[found->second]; @@ -365,16 +372,16 @@ Status TreeEnsembleCommon::Init( if (!nodes_[i].is_not_leaf()) { if (nodes_[i].falsenode_inc_or_n_weights == 0) { EXT_THROW("Target is missing for leaf ", ind.tree_id, "-", ind.node_id, - "."); + "."); } continue; } EXT_ENFORCE(truenode_ids[i] != i); // That would mean the left node is - // itself, leading to an infinite loop. + // itself, leading to an infinite loop. nodes_[i].truenode_inc_or_first_weight = static_cast(truenode_ids[i] - i); EXT_ENFORCE(falsenode_ids[i] != i); // That would mean the right node is - // itself, leading to an infinite loop. + // itself, leading to an infinite loop. 
nodes_[i].falsenode_inc_or_n_weights = static_cast(falsenode_ids[i] - i); } @@ -445,7 +452,8 @@ int TreeEnsembleCommon:: while (!stack.empty()) { pair = stack.front(); stack.pop_front(); - // EXT_ENFORCE(map_node_to_node3.find(pair.first) == map_node_to_node3.end(), + // EXT_ENFORCE(map_node_to_node3.find(pair.first) == + // map_node_to_node3.end(), // "This node index ", pair.first, // " was already added as a TreeNodeElement3."); node = pair.second; @@ -603,7 +611,8 @@ void TreeEnsembleCommon::ComputeAgg( DEBUG_PRINT("max_num_threads=", max_num_threads) DEBUG_PRINT("parallel_tree_N_=", parallel_tree_N_) DEBUG_PRINT("parallel_tree_n=", parallel_tree_n) - DEBUG_PRINT("n_targets_or_classes_=", n_targets_or_classes_, " N=", N, " agg.kind()=", agg.kind()) + DEBUG_PRINT("n_targets_or_classes_=", n_targets_or_classes_, " N=", N, + " agg.kind()=", agg.kind()) if (n_targets_or_classes_ == 1) { DEBUG_PRINT() @@ -654,7 +663,8 @@ void TreeEnsembleCommon::ComputeAgg( // That's why the first loop split into batch so that every batch holds // on caches, then loop on trees and finally loop on the batch rows. DEBUG_PRINT() - std::vector> scores(std::min(parallel_tree_n, N)); + std::vector> scores( + std::min(parallel_tree_n, N)); size_t j; int64_t i, batch, batch_end; @@ -806,10 +816,9 @@ void TreeEnsembleCommon::ComputeAgg( } } for (i = batch; i < batch_end; ++i) { - agg.FinalizeScores(scores[i - batch], - z_data + i * n_targets_or_classes_, -1, - label_data == nullptr ? nullptr - : (label_data + i)); + agg.FinalizeScores( + scores[i - batch], z_data + i * n_targets_or_classes_, -1, + label_data == nullptr ? 
nullptr : (label_data + i)); } } DEBUG_PRINT() @@ -992,7 +1001,7 @@ const TreeNodeElement * TreeEnsembleCommon::ProcessTreeNodeLeave3( size_t root_id, const InputType *x_data) const { EXT_ENFORCE(same_mode_, "This optimization is only available when all node " - "follow the same mode."); + "follow the same mode."); const TreeNodeElement3 *root3 = roots3_[root_id]; const TreeNodeElement *root; EXT_ENFORCE(root3 != nullptr, "No optimization for tree ", root_id, "."); @@ -1018,7 +1027,8 @@ TreeEnsembleCommon::ProcessTreeNodeLeave3( } break; default: - EXT_THROW("TreeNodeElement3 not yet implement with mode ", root3->mode(), "."); + EXT_THROW("TreeNodeElement3 not yet implement with mode ", root3->mode(), + "."); } } @@ -1047,7 +1057,8 @@ TreeEnsembleCommon::ProcessTreeNodeLeave( DEBUG_PRINT("LEQ+") while (root->is_not_leaf()) { val = x_data[root->feature_id]; - DEBUG_PRINT("val=", val, " root->value_or_unique_weight=", root->value_or_unique_weight) + DEBUG_PRINT("val=", val, " root->value_or_unique_weight=", + root->value_or_unique_weight) root += val <= root->value_or_unique_weight ? 
root->truenode_inc_or_first_weight : root->falsenode_inc_or_n_weights; diff --git a/onnx_extended/reference/c_ops/cpu/c_op_tree_ensemble_common_agg_.hpp b/onnx_extended/reference/c_ops/cpu/c_op_tree_ensemble_common_agg_.hpp index b59ec7ca..fc82d694 100644 --- a/onnx_extended/reference/c_ops/cpu/c_op_tree_ensemble_common_agg_.hpp +++ b/onnx_extended/reference/c_ops/cpu/c_op_tree_ensemble_common_agg_.hpp @@ -9,12 +9,6 @@ #include #include -#include -#include -#include - -namespace py = pybind11; - #include "c_op_common.h" namespace onnx_c_ops { @@ -278,7 +272,7 @@ class TreeAggregator { write_scores(predictions, post_transform_, Z, add_second_class); } - const char* kind() const { return "NONE"; } + const char *kind() const { return "NONE"; } }; ///////////// @@ -354,7 +348,7 @@ class TreeAggregatorSum write_scores(predictions, this->post_transform_, Z, add_second_class); } - const char* kind() const { return "SUM"; } + const char *kind() const { return "SUM"; } }; template @@ -393,7 +387,7 @@ class TreeAggregatorAverage write_scores(predictions, this->post_transform_, Z, add_second_class); } - const char* kind() const { return "AVERAGE"; } + const char *kind() const { return "AVERAGE"; } }; template @@ -462,7 +456,7 @@ class TreeAggregatorMin } } - const char* kind() const { return "MIN"; } + const char *kind() const { return "MIN"; } }; template @@ -532,7 +526,7 @@ class TreeAggregatorMax } } - const char* kind() const { return "MAX"; } + const char *kind() const { return "MAX"; } }; ///////////////// @@ -710,7 +704,7 @@ class TreeAggregatorClassifier predictions.resize(2); } - const char* kind() const { return "CLASSIFICATION"; } + const char *kind() const { return "CLASSIFICATION"; } }; } // namespace onnx_c_ops diff --git a/onnx_extended/reference/c_ops/cpu/c_op_tree_ensemble_common_classifier_.hpp b/onnx_extended/reference/c_ops/cpu/c_op_tree_ensemble_common_classifier_.hpp index e219d12f..cdb6a3ad 100644 --- 
a/onnx_extended/reference/c_ops/cpu/c_op_tree_ensemble_common_classifier_.hpp +++ b/onnx_extended/reference/c_ops/cpu/c_op_tree_ensemble_common_classifier_.hpp @@ -36,12 +36,12 @@ class TreeEnsembleCommonClassifier } } - Status Init(const std::string &aggregate_function, // 3 - const std::vector &base_values, // 4 - int64_t n_targets_or_classes, // 5 - const std::vector &nodes_falsenodeids, // 6 - const std::vector &nodes_featureids, // 7 - const std::vector &nodes_hitrates, // 8 + Status Init(const std::string &aggregate_function, // 3 + const std::vector &base_values, // 4 + int64_t n_targets_or_classes, // 5 + const std::vector &nodes_falsenodeids, // 6 + const std::vector &nodes_featureids, // 7 + const std::vector &nodes_hitrates, // 8 const std::vector &nodes_missing_value_tracks_true, // 9 const std::vector &nodes_modes, // 10 const std::vector &nodes_nodeids, // 11 @@ -55,23 +55,23 @@ class TreeEnsembleCommonClassifier const std::vector &class_weights // 19 ) { TreeEnsembleCommon::Init( - aggregate_function, // 3 - base_values, // 4 - n_targets_or_classes, // 5 - nodes_falsenodeids, // 6 - nodes_featureids, // 7 - nodes_hitrates, // 8 - nodes_missing_value_tracks_true, // 9 - nodes_modes, // 10 - nodes_nodeids, // 11 - nodes_treeids, // 12 - nodes_truenodeids, // 13 - nodes_values, // 14 - post_transform, // 15 - class_ids, // 16 - class_nodeids, // 17 - class_treeids, // 18 - class_weights // 19 + aggregate_function, // 3 + base_values, // 4 + n_targets_or_classes, // 5 + nodes_falsenodeids, // 6 + nodes_featureids, // 7 + nodes_hitrates, // 8 + nodes_missing_value_tracks_true, // 9 + nodes_modes, // 10 + nodes_nodeids, // 11 + nodes_treeids, // 12 + nodes_truenodeids, // 13 + nodes_values, // 14 + post_transform, // 15 + class_ids, // 16 + class_nodeids, // 17 + class_treeids, // 18 + class_weights // 19 ); DEBUG_PRINT("Init") diff --git a/onnx_extended/reference/c_ops/cpu/c_op_tree_ensemble_py_.cpp 
b/onnx_extended/reference/c_ops/cpu/c_op_tree_ensemble_py_.cpp index d06569b1..b094151d 100644 --- a/onnx_extended/reference/c_ops/cpu/c_op_tree_ensemble_py_.cpp +++ b/onnx_extended/reference/c_ops/cpu/c_op_tree_ensemble_py_.cpp @@ -4,6 +4,12 @@ #include "c_op_tree_ensemble_py_.hpp" #include "c_op_tree_ensemble_py_classifier_.hpp" +#include +#include +#include + +namespace py = pybind11; + ////////////////////////////////////////// // Classifier ////////////////////////////////////////// diff --git a/onnx_extended/reference/c_ops/cpu/c_op_tree_ensemble_py_.hpp b/onnx_extended/reference/c_ops/cpu/c_op_tree_ensemble_py_.hpp index 88e79610..7942f187 100644 --- a/onnx_extended/reference/c_ops/cpu/c_op_tree_ensemble_py_.hpp +++ b/onnx_extended/reference/c_ops/cpu/c_op_tree_ensemble_py_.hpp @@ -3,6 +3,12 @@ #include "c_op_tree_ensemble_common_.hpp" +#include +#include +#include + +namespace py = pybind11; + #define py_array_t_int64_t \ py::array_t #define py_array_t_ntype_t \ @@ -86,23 +92,23 @@ class RuntimeTreeEnsembleCommon array2vector(ttarget_class_treeids, target_class_treeids, int64_t); array2vector(ttarget_class_weights, target_class_weights, NTYPE); - init_c(aggregate_function, // 3 - cbasevalues, // 4 - n_targets_or_classes, // 5 - tnodes_falsenodeids, // 6 - tnodes_featureids, // 7 - tnodes_hitrates, // 8 - tmissing_tracks_true, // 9 - nodes_modes, // 10 - tnodes_nodeids, // 11 - tnodes_treeids, // 12 - tnodes_truenodeids, // 13 - tnodes_values, // 14 - post_transform, // 15 - ttarget_class_ids, // 16 - ttarget_class_nodeids, // 17 - ttarget_class_treeids, // 18 - ttarget_class_weights // 19 + init_c(aggregate_function, // 3 + cbasevalues, // 4 + n_targets_or_classes, // 5 + tnodes_falsenodeids, // 6 + tnodes_featureids, // 7 + tnodes_hitrates, // 8 + tmissing_tracks_true, // 9 + nodes_modes, // 10 + tnodes_nodeids, // 11 + tnodes_treeids, // 12 + tnodes_truenodeids, // 13 + tnodes_values, // 14 + post_transform, // 15 + ttarget_class_ids, // 16 + 
ttarget_class_nodeids, // 17 + ttarget_class_treeids, // 18 + ttarget_class_weights // 19 ); } @@ -124,23 +130,23 @@ class RuntimeTreeEnsembleCommon const std::vector &target_class_treeids, // 18 const std::vector &target_class_weights // 19 ) { - this->Init(aggregate_function, // 3 - base_values, // 4 - n_targets_or_classes, // 5 - nodes_falsenodeids, // 6 - nodes_featureids, // 7 - nodes_hitrates, // 8 - nodes_missing_value_tracks_true, // 9 - nodes_modes, // 10 - nodes_nodeids, // 11 - nodes_treeids, // 12 - nodes_truenodeids, // 13 - nodes_values, // 14 - post_transform, // 15 - target_class_ids, // 16 - target_class_nodeids, // 17 - target_class_treeids, // 18 - target_class_weights // 19 + this->Init(aggregate_function, // 3 + base_values, // 4 + n_targets_or_classes, // 5 + nodes_falsenodeids, // 6 + nodes_featureids, // 7 + nodes_hitrates, // 8 + nodes_missing_value_tracks_true, // 9 + nodes_modes, // 10 + nodes_nodeids, // 11 + nodes_treeids, // 12 + nodes_truenodeids, // 13 + nodes_values, // 14 + post_transform, // 15 + target_class_ids, // 16 + target_class_nodeids, // 17 + target_class_treeids, // 18 + target_class_weights // 19 ); } diff --git a/onnx_extended/reference/c_ops/cpu/c_op_tree_ensemble_py_classifier_.hpp b/onnx_extended/reference/c_ops/cpu/c_op_tree_ensemble_py_classifier_.hpp index ec41a068..548cac28 100644 --- a/onnx_extended/reference/c_ops/cpu/c_op_tree_ensemble_py_classifier_.hpp +++ b/onnx_extended/reference/c_ops/cpu/c_op_tree_ensemble_py_classifier_.hpp @@ -67,23 +67,23 @@ class RuntimeTreeEnsembleClassifier array2vector(ttarget_class_treeids, target_class_treeids, int64_t); array2vector(ttarget_class_weights, target_class_weights, NTYPE); - init_c(aggregate_function, // 3 - cbasevalues, // 4 - n_targets_or_classes, // 5 - tnodes_falsenodeids, // 6 - tnodes_featureids, // 7 - tnodes_hitrates, // 8 - tmissing_tracks_true, // 9 - nodes_modes, // 10 - tnodes_nodeids, // 11 - tnodes_treeids, // 12 - tnodes_truenodeids, // 13 - 
tnodes_values, // 14 - post_transform, // 15 - ttarget_class_ids, // 16 - ttarget_class_nodeids, // 17 - ttarget_class_treeids, // 18 - ttarget_class_weights // 19 + init_c(aggregate_function, // 3 + cbasevalues, // 4 + n_targets_or_classes, // 5 + tnodes_falsenodeids, // 6 + tnodes_featureids, // 7 + tnodes_hitrates, // 8 + tmissing_tracks_true, // 9 + nodes_modes, // 10 + tnodes_nodeids, // 11 + tnodes_treeids, // 12 + tnodes_truenodeids, // 13 + tnodes_values, // 14 + post_transform, // 15 + ttarget_class_ids, // 16 + ttarget_class_nodeids, // 17 + ttarget_class_treeids, // 18 + ttarget_class_weights // 19 ); } @@ -105,23 +105,23 @@ class RuntimeTreeEnsembleClassifier const std::vector &target_class_treeids, // 18 const std::vector &target_class_weights // 19 ) { - this->Init(aggregate_function, // 3 - base_values, // 4 - n_targets_or_classes, // 5 - nodes_falsenodeids, // 6 - nodes_featureids, // 7 - nodes_hitrates, // 8 - nodes_missing_value_tracks_true, // 9 - nodes_modes, // 10 - nodes_nodeids, // 11 - nodes_treeids, // 12 - nodes_truenodeids, // 13 - nodes_values, // 14 - post_transform, // 15 - target_class_ids, // 16 - target_class_nodeids, // 17 - target_class_treeids, // 18 - target_class_weights // 19 + this->Init(aggregate_function, // 3 + base_values, // 4 + n_targets_or_classes, // 5 + nodes_falsenodeids, // 6 + nodes_featureids, // 7 + nodes_hitrates, // 8 + nodes_missing_value_tracks_true, // 9 + nodes_modes, // 10 + nodes_nodeids, // 11 + nodes_treeids, // 12 + nodes_truenodeids, // 13 + nodes_values, // 14 + post_transform, // 15 + target_class_ids, // 16 + target_class_nodeids, // 17 + target_class_treeids, // 18 + target_class_weights // 19 ); } diff --git a/onnx_extended/reference/c_reference_evaluator.py b/onnx_extended/reference/c_reference_evaluator.py index 8b04acdc..4ffb5c63 100644 --- a/onnx_extended/reference/c_reference_evaluator.py +++ b/onnx_extended/reference/c_reference_evaluator.py @@ -4,15 +4,6 @@ from onnx.defs import 
get_schema from onnx.reference import ReferenceEvaluator from onnx.reference.op_run import OpRun -from onnx_extended.reference.c_ops.c_op_conv import Conv -from onnx_extended.reference.c_ops.c_op_tree_ensemble_regressor import ( - TreeEnsembleRegressor_1, - TreeEnsembleRegressor_3, -) -from onnx_extended.reference.c_ops.c_op_tree_ensemble_classifier import ( - TreeEnsembleClassifier_1, - TreeEnsembleClassifier_3, -) class CReferenceEvaluator(ReferenceEvaluator): @@ -29,13 +20,28 @@ class CReferenceEvaluator(ReferenceEvaluator): ref = ReferenceEvaluator(..., new_ops=[Conv]) """ - default_ops = [ - Conv, - TreeEnsembleClassifier_1, - TreeEnsembleClassifier_3, - TreeEnsembleRegressor_1, - TreeEnsembleRegressor_3, - ] + def default_ops(): + from onnx_extended.reference.c_ops.c_op_conv import Conv + from onnx_extended.reference.c_ops.c_op_tree_ensemble_regressor import ( + TreeEnsembleRegressor_1, + TreeEnsembleRegressor_3, + ) + from onnx_extended.reference.c_ops.c_op_tree_ensemble_classifier import ( + TreeEnsembleClassifier_1, + TreeEnsembleClassifier_3, + ) + from onnx_extended.reference.c_custom_ops.custom_op_tree_ensemble_regressor import ( # noqa: E501 + TreeEnsembleRegressor_1 as TreeEnsembleRegressor_1_Float, + ) + + return [ + Conv, + TreeEnsembleClassifier_1, + TreeEnsembleClassifier_3, + TreeEnsembleRegressor_1, + TreeEnsembleRegressor_3, + TreeEnsembleRegressor_1_Float, + ] @staticmethod def filter_ops(proto, new_ops, opsets): @@ -84,10 +90,10 @@ def __init__( **kwargs, ): if new_ops is None: - new_ops = CReferenceEvaluator.default_ops + new_ops = CReferenceEvaluator.default_ops() else: new_ops = new_ops.copy() - new_ops.extend(CReferenceEvaluator.default_ops) + new_ops.extend(CReferenceEvaluator.default_ops()) new_ops = CReferenceEvaluator.filter_ops(proto, new_ops, opsets) ReferenceEvaluator.__init__( diff --git a/onnx_extended/validation/cpu/_validation.cpp b/onnx_extended/validation/cpu/_validation.cpp index a6e768b5..67d74b3b 100644 --- 
a/onnx_extended/validation/cpu/_validation.cpp +++ b/onnx_extended/validation/cpu/_validation.cpp @@ -116,8 +116,7 @@ and parallelization (see `AVX API :return: sum of all elements )pbdoc"); - m.def("vector_add", &vector_add, - py::arg("v1"), py::arg("v2"), + m.def("vector_add", &vector_add, py::arg("v1"), py::arg("v2"), R"pbdoc(Computes the addition of 2 vectors of any dimensions. It assumes both vectors have the same dimensions (no broadcast).). @@ -125,5 +124,4 @@ It assumes both vectors have the same dimensions (no broadcast).). :param v2: second vector :return: new vector )pbdoc"); - } diff --git a/onnx_extended/validation/cpu/speed_metrics.cpp b/onnx_extended/validation/cpu/speed_metrics.cpp index 522f7e31..c68e4f48 100644 --- a/onnx_extended/validation/cpu/speed_metrics.cpp +++ b/onnx_extended/validation/cpu/speed_metrics.cpp @@ -72,7 +72,7 @@ benchmark_cache_tree(int64_t n_rows, int64_t n_features, int64_t n_trees, if (seed > 10037) seed = n_features * 7 + 1; - #pragma omp parallel for +#pragma omp parallel for for (int64_t t = 0; t < n_trees; ++t) { int64_t end = batch + step < n_rows ? 
batch + step : n_rows; for (int64_t i = batch; i < end; ++i) { diff --git a/onnx_extended/validation/cpu/speed_metrics.h b/onnx_extended/validation/cpu/speed_metrics.h index ac068ec6..81de83d0 100644 --- a/onnx_extended/validation/cpu/speed_metrics.h +++ b/onnx_extended/validation/cpu/speed_metrics.h @@ -27,7 +27,7 @@ inline bool _isnan_(double x) { } inline bool _isnan_(float x) { - uint32_t* pv = reinterpret_cast(&x); + uint32_t *pv = reinterpret_cast(&x); uint32_t b = *pv; return (b & 0x7fc00000) == 0x7fc00000; } diff --git a/onnx_extended/validation/cpu/vector_function.cpp b/onnx_extended/validation/cpu/vector_function.cpp index d2508e51..991a8fc7 100644 --- a/onnx_extended/validation/cpu/vector_function.cpp +++ b/onnx_extended/validation/cpu/vector_function.cpp @@ -16,7 +16,7 @@ namespace validation { -float vector_sum(int nl, int nc, const float* values, int by_rows) { +float vector_sum(int nl, int nc, const float *values, int by_rows) { float total = 0; if (by_rows) { for (size_t i = 0; i < nl; ++i) { diff --git a/onnx_extended/validation/cpu/vector_function.h b/onnx_extended/validation/cpu/vector_function.h index 81b34ea9..20f0b1f8 100644 --- a/onnx_extended/validation/cpu/vector_function.h +++ b/onnx_extended/validation/cpu/vector_function.h @@ -6,6 +6,6 @@ namespace validation { -float vector_sum(int nl, int nc, const float* values, int by_rows); +float vector_sum(int nl, int nc, const float *values, int by_rows); } // namespace validation diff --git a/onnx_extended/validation/cpu/vector_sum.cpp b/onnx_extended/validation/cpu/vector_sum.cpp index f536b488..300954df 100644 --- a/onnx_extended/validation/cpu/vector_sum.cpp +++ b/onnx_extended/validation/cpu/vector_sum.cpp @@ -166,38 +166,39 @@ float vector_sum_array_avx_parallel(int nc, } py_array_float vector_add(const py_array_float &v1, const py_array_float &v2) { - // Based on tutorial https://pybind11.readthedocs.io/en/stable/advanced/pycpp/numpy.html - if (v1.ndim() != v2.ndim()) { - throw 
std::runtime_error("Vector v1 and v2 must have the same shape."); - } - for(int i = 0; i < v1.ndim(); ++i) { - if (v1.shape(i) != v2.shape(i)) { - throw std::runtime_error("Vector v1 and v2 must have the same shape."); - } - } - std::vector shape(v1.ndim()); - for (int i = 0; i < v1.ndim(); ++i) { - shape[i] = v1.shape(i); - } - py::buffer_info b1 = v1.request(); - py::buffer_info b2 = v2.request(); - py_array_float result = py::array_t(shape); - py::buffer_info br = result.request(); - - const float * p1 = static_cast(b1.ptr); // pointer on v1 data - const float * p2 = static_cast(b2.ptr); // pointer on v2 data - float * pr = static_cast(br.ptr); // pointer on result data - if (p1 == nullptr || p2 == nullptr || pr == nullptr) { - throw std::runtime_error("One vector is empty."); - } - - // Here the addition. - int64_t size = v1.size(); - for(int64_t i = 0; i < size; ++i) { - pr[i] = p1[i] + p2[i]; - } - - return result; + // Based on tutorial + // https://pybind11.readthedocs.io/en/stable/advanced/pycpp/numpy.html + if (v1.ndim() != v2.ndim()) { + throw std::runtime_error("Vector v1 and v2 must have the same shape."); + } + for (int i = 0; i < v1.ndim(); ++i) { + if (v1.shape(i) != v2.shape(i)) { + throw std::runtime_error("Vector v1 and v2 must have the same shape."); + } + } + std::vector shape(v1.ndim()); + for (int i = 0; i < v1.ndim(); ++i) { + shape[i] = v1.shape(i); + } + py::buffer_info b1 = v1.request(); + py::buffer_info b2 = v2.request(); + py_array_float result = py::array_t(shape); + py::buffer_info br = result.request(); + + const float *p1 = static_cast(b1.ptr); // pointer on v1 data + const float *p2 = static_cast(b2.ptr); // pointer on v2 data + float *pr = static_cast(br.ptr); // pointer on result data + if (p1 == nullptr || p2 == nullptr || pr == nullptr) { + throw std::runtime_error("One vector is empty."); + } + + // Here the addition. 
+ int64_t size = v1.size(); + for (int64_t i = 0; i < size; ++i) { + pr[i] = p1[i] + p2[i]; + } + + return result; } } // namespace validation diff --git a/onnx_extended/validation/cpu/vector_sum.h b/onnx_extended/validation/cpu/vector_sum.h index 53d1b9c3..596f7478 100644 --- a/onnx_extended/validation/cpu/vector_sum.h +++ b/onnx_extended/validation/cpu/vector_sum.h @@ -13,7 +13,7 @@ namespace py = pybind11; namespace validation { - + // vector_sum float vector_sum(int nc, const std::vector &values, bool by_rows); diff --git a/onnx_extended/validation/cuda/__init__.py b/onnx_extended/validation/cuda/__init__.py index 8b137891..4dbda618 100644 --- a/onnx_extended/validation/cuda/__init__.py +++ b/onnx_extended/validation/cuda/__init__.py @@ -1 +1,14 @@ - +def cuda_version() -> str: + """ + Returns the cuda version it was compiled with. + If CUDA was not available, it returns `"0.0"`. + """ + try: + from .cuda_example_py import cuda_version as cv + except ImportError: + # No CUDA + return "0.0" + v = cv() + major = v // 1000 + minor = (v % 1000) // 10 + return f"{major}.{minor}" diff --git a/onnx_extended/validation/cuda/cuda_example.cu b/onnx_extended/validation/cuda/cuda_example.cu index ae5f0b63..0c9d18ab 100644 --- a/onnx_extended/validation/cuda/cuda_example.cu +++ b/onnx_extended/validation/cuda/cuda_example.cu @@ -11,36 +11,42 @@ namespace cuda_example { -__global__ void block_vector_add(const float *a, const float *b, float *c, int n) { +__global__ void block_vector_add(const float *a, const float *b, float *c, + int n) { int i = blockIdx.x * blockDim.x + threadIdx.x; if (i < n) { c[i] = a[i] + b[i]; } } -void kernel_vector_add(unsigned int size, const float* gpu_ptr1, const float* gpu_ptr2, float* gpu_res) { +void kernel_vector_add(unsigned int size, const float *gpu_ptr1, + const float *gpu_ptr2, float *gpu_res) { constexpr int blockSize = 256; int numBlocks = (size + blockSize - 1) / blockSize; - block_vector_add<<>>(gpu_ptr1, gpu_ptr2, gpu_res, size); + 
block_vector_add<<>>(gpu_ptr1, gpu_ptr2, gpu_res, size); } -void vector_add(unsigned int size, const float* ptr1, const float* ptr2, float* br, int cudaDevice) { +void vector_add(unsigned int size, const float *ptr1, const float *ptr2, + float *br, int cudaDevice) { // copy memory from CPU memory to CUDA memory NVTX_SCOPE("vector_add") checkCudaErrors(cudaSetDevice(cudaDevice)); float *gpu_ptr1, *gpu_ptr2, *gpu_res; checkCudaErrors(cudaMalloc(&gpu_ptr1, size * sizeof(float))); - checkCudaErrors(cudaMemcpy(gpu_ptr1, ptr1, size * sizeof(float), cudaMemcpyHostToDevice)); + checkCudaErrors( + cudaMemcpy(gpu_ptr1, ptr1, size * sizeof(float), cudaMemcpyHostToDevice)); checkCudaErrors(cudaMalloc(&gpu_ptr2, size * sizeof(float))); - checkCudaErrors(cudaMemcpy(gpu_ptr2, ptr2, size * sizeof(float), cudaMemcpyHostToDevice)); + checkCudaErrors( + cudaMemcpy(gpu_ptr2, ptr2, size * sizeof(float), cudaMemcpyHostToDevice)); checkCudaErrors(cudaMalloc(&gpu_res, size * sizeof(float))); // execute the code kernel_vector_add(size, gpu_ptr1, gpu_ptr2, gpu_res); - checkCudaErrors(cudaMemcpy(br, gpu_res, size * sizeof(float), cudaMemcpyDeviceToHost)); + checkCudaErrors( + cudaMemcpy(br, gpu_res, size * sizeof(float), cudaMemcpyDeviceToHost)); // free the allocated vectors checkCudaErrors(cudaFree(gpu_ptr1)); @@ -49,16 +55,17 @@ void vector_add(unsigned int size, const float* ptr1, const float* ptr2, float* } unsigned int nextPow2(unsigned int x) { - --x; - x |= x >> 1; - x |= x >> 2; - x |= x >> 4; - x |= x >> 8; - x |= x >> 16; - return ++x; + --x; + x |= x >> 1; + x |= x >> 2; + x |= x >> 4; + x |= x >> 8; + x |= x >> 16; + return ++x; } -__global__ void kernel_sum_reduce0(float *g_idata, float *g_odata, unsigned int n) { +__global__ void kernel_sum_reduce0(float *g_idata, float *g_odata, + unsigned int n) { extern __shared__ float sdata[]; // load shared mem @@ -69,9 +76,9 @@ __global__ void kernel_sum_reduce0(float *g_idata, float *g_odata, unsigned int __syncthreads(); - for 
(unsigned int s=1; s < blockDim.x; s *= 2) { + for (unsigned int s = 1; s < blockDim.x; s *= 2) { // modulo arithmetic is slow! - if ((tid % (2*s)) == 0) { + if ((tid % (2 * s)) == 0) { sdata[tid] += sdata[tid + s]; } __syncthreads(); @@ -82,20 +89,24 @@ __global__ void kernel_sum_reduce0(float *g_idata, float *g_odata, unsigned int } } -float kernel_vector_sum_reduce0(float* gpu_ptr, unsigned int size, int maxThreads) { +float kernel_vector_sum_reduce0(float *gpu_ptr, unsigned int size, + int maxThreads) { int threads = (size < maxThreads) ? nextPow2(size) : maxThreads; - int blocks = (size + threads - 1) / threads; + int blocks = (size + threads - 1) / threads; dim3 dimBlock(threads, 1, 1); dim3 dimGrid(blocks, 1, 1); - float* gpu_block_ptr; + float *gpu_block_ptr; checkCudaErrors(cudaMalloc(&gpu_block_ptr, blocks * sizeof(float))); - int smemSize = (threads <= 32) ? 2 * threads * sizeof(float) : threads * sizeof(float); - kernel_sum_reduce0<<>>(gpu_ptr, gpu_block_ptr, size); + int smemSize = + (threads <= 32) ? 2 * threads * sizeof(float) : threads * sizeof(float); + kernel_sum_reduce0<<>>(gpu_ptr, gpu_block_ptr, + size); // the last reduction happens on CPU, the first step is to move // the data from GPU to CPU. 
- float* cpu_ptr = new float[blocks]; - checkCudaErrors(cudaMemcpy(cpu_ptr, gpu_block_ptr, blocks * sizeof(float), cudaMemcpyDeviceToHost)); + float *cpu_ptr = new float[blocks]; + checkCudaErrors(cudaMemcpy(cpu_ptr, gpu_block_ptr, blocks * sizeof(float), + cudaMemcpyDeviceToHost)); float gpu_result = 0; for (int i = 0; i < blocks; ++i) { gpu_result += cpu_ptr[i]; @@ -105,15 +116,15 @@ float kernel_vector_sum_reduce0(float* gpu_ptr, unsigned int size, int maxThread return gpu_result; } -float vector_sum0(unsigned int size, const float* ptr, int maxThreads, +float vector_sum0(unsigned int size, const float *ptr, int maxThreads, int cudaDevice) { // copy memory from CPU memory to CUDA memory NVTX_SCOPE("vector_sum0") float *gpu_ptr; checkCudaErrors(cudaSetDevice(cudaDevice)); checkCudaErrors(cudaMalloc(&gpu_ptr, size * sizeof(float))); - checkCudaErrors(cudaMemcpy(gpu_ptr, ptr, size * sizeof(float), - cudaMemcpyHostToDevice)); + checkCudaErrors( + cudaMemcpy(gpu_ptr, ptr, size * sizeof(float), cudaMemcpyHostToDevice)); // execute the code float result = kernel_vector_sum_reduce0(gpu_ptr, size, maxThreads); @@ -124,23 +135,23 @@ float vector_sum0(unsigned int size, const float* ptr, int maxThreads, } __global__ void vector_sum(float *input, float *output, unsigned int size) { - int tid = threadIdx.x + blockIdx.x * blockDim.x; - int stride = blockDim.x * gridDim.x; - float sum = 0.0f; - for (int i = tid; i < size; i += stride) { - sum += input[i]; - } - atomicAdd(output, sum); + int tid = threadIdx.x + blockIdx.x * blockDim.x; + int stride = blockDim.x * gridDim.x; + float sum = 0.0f; + for (int i = tid; i < size; i += stride) { + sum += input[i]; + } + atomicAdd(output, sum); } -float vector_sum_atomic(unsigned int size, const float* ptr, - int maxThreads, int cudaDevice) { +float vector_sum_atomic(unsigned int size, const float *ptr, int maxThreads, + int cudaDevice) { NVTX_SCOPE("vector_sum_atomic") float *input, *output; float sum = 0.0f; cudaMalloc(&input, size * 
sizeof(float)); - checkCudaErrors(cudaMemcpy(input, ptr, size * sizeof(float), - cudaMemcpyHostToDevice)); + checkCudaErrors( + cudaMemcpy(input, ptr, size * sizeof(float), cudaMemcpyHostToDevice)); cudaMalloc(&output, sizeof(float)); cudaMemcpy(output, &sum, sizeof(float), cudaMemcpyHostToDevice); vector_sum<<>>(input, output, size); @@ -150,5 +161,4 @@ float vector_sum_atomic(unsigned int size, const float* ptr, return sum; } - } // namespace cuda_example diff --git a/onnx_extended/validation/cuda/cuda_example.cuh b/onnx_extended/validation/cuda/cuda_example.cuh index d325000c..594e782b 100644 --- a/onnx_extended/validation/cuda/cuda_example.cuh +++ b/onnx_extended/validation/cuda/cuda_example.cuh @@ -5,8 +5,10 @@ unsigned int nextPow2(unsigned int x); void vector_add(unsigned int size, const float *ptr1, const float *ptr2, float *ptr3, int cudaDevice); -float vector_sum0(unsigned int size, const float *ptr, int max_threads, int cudaDevice); +float vector_sum0(unsigned int size, const float *ptr, int max_threads, + int cudaDevice); -float vector_sum_atomic(unsigned int size, const float* ptr, int maxThreads, int cudaDevice); +float vector_sum_atomic(unsigned int size, const float *ptr, int maxThreads, + int cudaDevice); } // namespace cuda_example diff --git a/onnx_extended/validation/cuda/cuda_example_reduce.cu b/onnx_extended/validation/cuda/cuda_example_reduce.cu index debbc5ef..1f323096 100644 --- a/onnx_extended/validation/cuda/cuda_example_reduce.cu +++ b/onnx_extended/validation/cuda/cuda_example_reduce.cu @@ -13,11 +13,11 @@ namespace cuda_example { -#define reduce6_block_and_sync(I,I2) \ - if ((blockSize >= I) && (tid < I2)) { \ - sdata[tid] = mySum = mySum + sdata[tid + I2]; \ - } \ - __syncthreads(); +#define reduce6_block_and_sync(I, I2) \ + if ((blockSize >= I) && (tid < I2)) { \ + sdata[tid] = mySum = mySum + sdata[tid + I2]; \ + } \ + __syncthreads(); template __global__ void kernel_reduce6(const T *g_idata, T *g_odata, unsigned int n) { @@ -43,13 
+43,12 @@ __global__ void kernel_reduce6(const T *g_idata, T *g_odata, unsigned int n) { sdata[tid] = mySum; __syncthreads(); - // reduction within a block in shared memory reduce6_block_and_sync(512, 256); reduce6_block_and_sync(256, 128); reduce6_block_and_sync(128, 64); -#if (__CUDA_ARCH__ >= 300 ) +#if (__CUDA_ARCH__ >= 300) if (tid < 32) { if (blockSize >= 64) { mySum += sdata[tid + 32]; @@ -82,20 +81,23 @@ bool isPow2(unsigned int n) { return (n & (n - 1)) == 0; } -#define case_vector_sum_6_block(T, I, B) \ - case I: \ - kernel_reduce6<<>>(gpu_ptr, gpu_block_ptr, size); \ - break; +#define case_vector_sum_6_block(T, I, B) \ + case I: \ + kernel_reduce6 \ + <<>>(gpu_ptr, gpu_block_ptr, size); \ + break; -float kernel_vector_sum_6(unsigned int size, const float* gpu_ptr, int maxThreads) { +float kernel_vector_sum_6(unsigned int size, const float *gpu_ptr, + int maxThreads) { int threads = (size < maxThreads) ? nextPow2(size) : maxThreads; - int blocks = (size + threads - 1) / threads; + int blocks = (size + threads - 1) / threads; dim3 dimBlock(threads, 1, 1); dim3 dimGrid(blocks, 1, 1); - float* gpu_block_ptr; + float *gpu_block_ptr; checkCudaErrors(cudaMalloc(&gpu_block_ptr, blocks * sizeof(float))); - int smemSize = (threads <= 32) ? 2 * threads * sizeof(float) : threads * sizeof(float); + int smemSize = + (threads <= 32) ? 2 * threads * sizeof(float) : threads * sizeof(float); if (isPow2(size)) { switch (threads) { @@ -110,8 +112,7 @@ float kernel_vector_sum_6(unsigned int size, const float* gpu_ptr, int maxThread case_vector_sum_6_block(float, 2, true); case_vector_sum_6_block(float, 1, true); } - } - else { + } else { switch (threads) { case_vector_sum_6_block(float, 512, false); case_vector_sum_6_block(float, 256, false); @@ -128,8 +129,9 @@ float kernel_vector_sum_6(unsigned int size, const float* gpu_ptr, int maxThread // the last reduction happens on CPU, the first step is to move // the data from GPU to CPU. 
- float* cpu_ptr = new float[blocks]; - checkCudaErrors(cudaMemcpy(cpu_ptr, gpu_block_ptr, blocks * sizeof(float), cudaMemcpyDeviceToHost)); + float *cpu_ptr = new float[blocks]; + checkCudaErrors(cudaMemcpy(cpu_ptr, gpu_block_ptr, blocks * sizeof(float), + cudaMemcpyDeviceToHost)); float gpu_result = 0; for (int i = 0; i < blocks; ++i) { gpu_result += cpu_ptr[i]; @@ -137,18 +139,17 @@ float kernel_vector_sum_6(unsigned int size, const float* gpu_ptr, int maxThread checkCudaErrors(cudaFree(gpu_block_ptr)); delete[] cpu_ptr; return gpu_result; - } -float vector_sum6(unsigned int size, const float* ptr, int maxThreads, +float vector_sum6(unsigned int size, const float *ptr, int maxThreads, int cudaDevice) { // copy memory from CPU memory to CUDA memory NVTX_SCOPE("vector_sum6") float *gpu_ptr; checkCudaErrors(cudaSetDevice(cudaDevice)); checkCudaErrors(cudaMalloc(&gpu_ptr, size * sizeof(float))); - checkCudaErrors(cudaMemcpy(gpu_ptr, ptr, size * sizeof(float), - cudaMemcpyHostToDevice)); + checkCudaErrors( + cudaMemcpy(gpu_ptr, ptr, size * sizeof(float), cudaMemcpyHostToDevice)); // execute the code float result = kernel_vector_sum_6(size, gpu_ptr, maxThreads); diff --git a/pyproject.toml b/pyproject.toml index 61fe4f4c..4f715d75 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -19,7 +19,7 @@ classifiers = [ "Programming Language :: Python :: 3.11", ] dependencies = ["numpy", "onnx>=1.14.0", "scipy"] -description = "Extends the list of supported operators in onnx reference implementation, or implements faster versions in C++." +description = "Extends the list of supported operators in onnx reference implementation and onnxruntime, or implements faster versions in C++." 
keywords = ["onnx", "onnxruntime", "CUDA", "openmp", "cmake", "cython", "pybind11"] license = {file = "LICENSE.txt"} name = "onnx-extended" @@ -31,7 +31,7 @@ version = "0.2.0" homepage = "https://sdpython.github.io/doc/onnx-extended/" documentation = "https://sdpython.github.io/doc/onnx-extended/" repository = "https://github.com/sdpython/onnx-extended" -# changelog = "https://github.com/sdpython/onnx-extended/CHANGELOG.md" +changelog = "https://github.com/sdpython/onnx-extended/CHANGELOGS.rst" [project.optional-dependencies] dev = [ @@ -51,7 +51,7 @@ dev = [ "ml-dtypes", "onnx-array-api", "onnxmltools", - "onnxruntime; python_version < '3.11'", + "onnxruntime", "pandas", "psutil", "pyquickhelper>=1.12.3821", @@ -96,7 +96,7 @@ ignore_messages = "Duplicate implicit target name: \"setup.py\"" namespaces = false [tool.setuptools.package-data] -"*" = ["*.cc", "*.cpp", "*.cu", "*.cuh", "*.dylib", "*.h", "*.hpp", "*.pyd", "*.so*"] +"*" = ["*.cc", "*.cpp", "*.cu", "*.cuh", "*.dll", "*.dylib", "*.h", "*.hpp", "*.pyd", "*.so*"] [tool.cibuildwheel] build = "*" @@ -105,7 +105,7 @@ manylinux-x86_64-image = "manylinux2014" [tool.cibuildwheel.linux] archs = ["x86_64"] build = "cp*" -skip = "cp36-* cp37-* cp38-* cp39-* cp311-* pypy* *musllinux*" +skip = "cp36-* cp37-* cp38-* cp39-* pypy* *musllinux*" [tool.cibuildwheel.macos] archs = ["x86_64"] diff --git a/requirements-dev.txt b/requirements-dev.txt index 7b26af67..3fdd1ebb 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -14,7 +14,7 @@ matplotlib ml-dtypes onnx-array-api onnxmltools -onnxruntime; python_version < '3.11' +onnxruntime openpyxl packaging pandas diff --git a/setup.py b/setup.py index 90672585..64e0f6fc 100644 --- a/setup.py +++ b/setup.py @@ -37,10 +37,14 @@ "*.hpp", "*.pyd", "*.so*", + "*.dll", ] package_data = { - "onnx_extended.ortcy.wrap": known_extensions, + "onnx_extended": known_extensions, + "onnx_extended.ortops.optim.cpu": known_extensions, "onnx_extended.ortops.tutorial.cpu": 
known_extensions, + "onnx_extended.ortops.tutorial.cuda": known_extensions, + "onnx_extended.ortcy.wrap": known_extensions, "onnx_extended.reference.c_ops.cpu": known_extensions, "onnx_extended.validation.cpu": known_extensions, "onnx_extended.validation.cython": known_extensions, @@ -62,7 +66,7 @@ except FileNotFoundError: long_description = "" -version_str = "0.1.0" +VERSION_STR = "0.2.0" with open(os.path.join(here, "onnx_extended/__init__.py"), "r") as f: line = [ _ @@ -70,7 +74,7 @@ if _.startswith("__version__") ] if len(line) > 0: - version_str = line[0].split("=")[1].strip('" ') + VERSION_STR = line[0].split("=")[1].strip('" ') ######################################## # C++ Helper @@ -213,31 +217,31 @@ def __init__(self, name: str, library: str = "") -> None: class cmake_build_ext(build_ext): user_options = [ *build_ext.user_options, - ("enable-nvtx=", None, "Enables compilation with NVTX events."), ( - "with-cuda=", + "use-cuda", None, "If cuda is available, CUDA is " "used by default unless this option is set to 0", ), + ("use-nvtx=", None, "Enables compilation with NVTX events."), ( - "cuda-version=", + "cuda-version", None, "If cuda is available, it searches the installed version " "unless this option is defined.", ), ( - "parallel=", + "parallel", None, "Parallelization", ), ( - "ort-version=", + "ort-version", None, "onnxruntime version, a path is allowed", ), ( - "cuda-build=", + "cuda-build", None, "CUDA code can be compiled to be working with " "different architectures, this flag can optimize " @@ -247,28 +251,68 @@ class cmake_build_ext(build_ext): ] def initialize_options(self): - self.enable_nvtx = None - self.with_cuda = None + self.use_nvtx = None + self.use_cuda = None self.cuda_version = None self.parallel = None self.ort_version = DEFAULT_ORT_VERSION self.cuda_build = "DEFAULT" + build_ext.initialize_options(self) + # boolean + b_values = {0, 1, "1", "0", True, False} + t_values = {1, "1", True} + for att in ["use_nvtx", "use_cuda"]: + v = 
getattr(self, att) + if v is not None: + continue + v = os.environ.get(att.upper(), None) + if v is None: + continue + if v not in b_values: + raise ValueError(f"Unable to interpret value {v} for {att.upper()!r}.") + print(f"-- setup: use env {att.upper()}={v in t_values}") + setattr(self, att, v in t_values) + if self.ort_version is None: + self.ort_version = os.environ.get("ORT_VERSION", None) + if self.ort_version not in ("", None): + print(f"-- setup: use env ORT_VERSION={self.ort_version}") + if self.cuda_build is None: + self.cuda_build = os.environ.get("CUDA_BUILD", None) + if self.cuda_build not in ("", None): + print(f"-- setup: use env CUDA_BUILD={self.cuda_build}") + if self.cuda_version is None: + self.cuda_version = os.environ.get("CUDA_VERSION", None) + if self.cuda_version not in ("", None): + print(f"-- setup: use env CUDA_VERSION={self.cuda_version}") + if self.use_nvtx is None: + self.use_nvtx = False + def finalize_options(self): - b_values = {None, 0, 1, "1", "0", True, False} - if self.enable_nvtx not in b_values: - raise ValueError(f"enable_nvtx={self.enable_nvtx!r} must be in {b_values}.") - if self.with_cuda not in b_values: - raise ValueError(f"with_cuda={self.with_cuda!r} must be in {b_values}.") - self.enable_nvtx = self.enable_nvtx in {1, "1", True, "True"} - self.with_cuda = self.with_cuda in {1, "1", True, "True", None} + build_ext.finalize_options(self) + + b_values = {0, 1, "1", "0", True, False, "True", "False"} + if self.use_nvtx not in b_values: + raise ValueError(f"use_nvtx={self.use_nvtx!r} must be in {b_values}.") + if self.use_cuda is None: + self.use_cuda = find_cuda() + if self.use_cuda not in b_values: + raise ValueError(f"use_cuda={self.use_cuda!r} must be in {b_values}.") + self.use_nvtx = self.use_nvtx in {1, "1", True, "True"} + self.use_cuda = self.use_cuda in {1, "1", True, "True"} if self.cuda_version in (None, ""): self.cuda_version = None build = {"DEFAULT", "H100", "H100opt"} if self.cuda_build not in build: - 
raise ValueError(f"cuda-built={self.cuda_build} not in {build}.") - build_ext.finalize_options(self) + raise ValueError(f"cuda-build={self.cuda_build!r} not in {build}.") + + options = {o[0]: o for o in self.user_options} + keys = list(sorted(options.keys())) + for na in keys: + opt = options[na] + name = opt[0].replace("-", "_").strip("=") + print(f"-- setup: option {name}={getattr(self, name, None)}") def get_cmake_args(self, cfg: str) -> List[str]: """ @@ -296,18 +340,17 @@ def get_cmake_args(self, cfg: str) -> List[str]: f"-DPYTHON_VERSION_MM={versmm}", f"-DPYTHON_MODULE_EXTENSION={module_ext}", f"-DORT_VERSION={self.ort_version}", + f"-DONNX_EXTENDED_VERSION={VERSION_STR}", ] if self.parallel is not None: cmake_args.append(f"-j{self.parallel}") - if os.environ.get("USE_NVTX", "0") in (1, "1") or self.enable_nvtx: + if self.use_nvtx: cmake_args.append("-DUSE_NVTX=1") - if os.environ.get("USE_CUDA", "1") in (0, "0") or not self.with_cuda: - cmake_args.append("-DUSE_CUDA=0") - else: - cmake_args.append("-DUSE_CUDA=1") + cmake_args.append(f"-DUSE_CUDA={1 if self.use_cuda else 0}") + if self.use_cuda: cmake_args.append(f"-DCUDA_BUILD={self.cuda_build}") - cuda_version = self.cuda_version or os.environ.get("CUDA_VERSION", "") + cuda_version = self.cuda_version if cuda_version not in (None, ""): cmake_args.append(f"-DCUDA_VERSION={cuda_version}") @@ -418,7 +461,7 @@ def process_extensions(self, cfg: str, build_path: str, build_lib: str): raise FileNotFoundError(f"Unable to find {look!r}.") if not os.path.exists(dest): raise FileNotFoundError(f"Unable to find folder {dest!r}.") - print(f"-- copy {look!r} to {dest!r}") + print(f"-- setup: copy-2 {look!r} to {dest!r}") shutil.copy(look, dest) def _process_setup_ext_line(self, cfg, build_path, line): @@ -450,7 +493,7 @@ def _process_setup_ext_line(self, cfg, build_path, line): raise FileNotFoundError( f"Unable to find library {fullname!r} (line={line!r})." 
) - print(f"-- copy {fullname!r} to {fulldest!r}") + print(f"-- setup: copy-1 {fullname!r} to {fulldest!r}") shutil.copy(fullname, fulldest) else: raise RuntimeError(f"Unable to interpret line {line!r}.") @@ -484,8 +527,11 @@ def build_extensions(self): cfg = "Release" cmake_args = self.get_cmake_args(cfg) build_path, build_lib = self.build_cmake(cfg, cmake_args) + print("-- process_setup_ext") self.process_setup_ext(cfg, build_path, "_setup_ext.txt") + print("-- process_extensions") self.process_extensions(cfg, build_path, build_lib) + print("-- done") def get_ext_modules(): @@ -500,8 +546,8 @@ def get_ext_modules(): has_cuda = find_cuda() if has_cuda: add_cuda = True - if "--with-cuda" in sys.argv: - pos = sys.argv.index("--with-cuda") + if "--use-cuda" in sys.argv: + pos = sys.argv.index("--use-cuda") if len(sys.argv) > pos + 1 and sys.argv[pos + 1] in ( "0", 0, @@ -509,10 +555,8 @@ def get_ext_modules(): "False", ): add_cuda = False - elif "--with-cuda=0" in sys.argv: + elif os.environ.get("USE_CUDA", None) in {0, "0", False}: add_cuda = False - elif "--with-cuda=1" in sys.argv or "--with-cuda=guess": - add_cuda = True if add_cuda: cuda_extensions.extend( [ @@ -555,7 +599,7 @@ def get_ext_modules(): setup( name="onnx-extended", - version=version_str, + version=VERSION_STR, description="More operators for onnx reference implementation", long_description=long_description, author="Xavier Dupré",