Skip to content

Commit

Permalink
Fix cuda build (#148)
Browse files Browse the repository at this point in the history
* fix cuda

* crash
  • Loading branch information
xadupre authored Dec 29, 2023
1 parent 7e3eb9e commit f9baa46
Show file tree
Hide file tree
Showing 4 changed files with 3 additions and 100 deletions.
1 change: 1 addition & 0 deletions _cmake/targets/_validation_cuda_example_py.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ if(CUDA_AVAILABLE)

cuda_pybind11_add_module(
cuda_example_py
../onnx_extended/validation/cuda/cuda_example_py.cpp
../onnx_extended/validation/cuda/cuda_fpemu.cu
../onnx_extended/validation/cuda/cuda_tensor.cu
../onnx_extended/validation/cuda/cuda_gemm.cu)
Expand Down
5 changes: 0 additions & 5 deletions _doc/api/validation_cuda.rst
Original file line number Diff line number Diff line change
Expand Up @@ -25,14 +25,9 @@ cuda_example_py
"cuda_device_memory",
"cuda_devices_memory",
"cuda_version",
"get_device_prop",
"gemm_benchmark_test",
"FpemuMode",
"fpemu_cuda_forward",
"vector_add",
"vector_sum_atomic",
"vector_sum0",
"vector_sum6",
]
names.sort()
classes = {"FpemuMode"}
Expand Down
1 change: 1 addition & 0 deletions _unittests/ut_ortops/test_optim_tree_ensemble.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,7 @@ def test_random_forest_regressor(self):
self.assertEqualArray(expected, got, atol=1e-5)

@unittest.skipIf(InferenceSession is None, "onnxruntime not installed")
@skipif_ci_apple("crash")
def test_tree_run_optimize_model(self):
from onnx_extended.ortops.optim.cpu import get_ort_ext_libs
from skl2onnx import to_onnx
Expand Down
96 changes: 1 addition & 95 deletions onnx_extended/validation/cuda/cuda_example_py.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
#include "cuda_example.cuh"
#include "cuda_example_reduce.cuh"
#include "cuda_fpemu.cuh"
#include "cuda_gemm.cuh"

Expand Down Expand Up @@ -122,99 +120,6 @@ PYBIND11_MODULE(cuda_example_py, m) {
:param ldb: leading dimension of B
:param ldd: leading dimension of the result
:return: metrics in a dictionary
)pbdoc");

m.def(
"vector_add",
[](const py_array_float &v1, const py_array_float &v2,
int cuda_device) -> py_array_float {
if (v1.size() != v2.size()) {
throw std::runtime_error("Vectors v1 and v2 have different number of elements.");
}
auto ha1 = v1.request();
float *ptr1 = reinterpret_cast<float *>(ha1.ptr);
auto ha2 = v2.request();
float *ptr2 = reinterpret_cast<float *>(ha2.ptr);

std::vector<int64_t> shape(v1.ndim());
for (int i = 0; i < v1.ndim(); ++i) {
shape[i] = v1.shape(i);
}
py_array_float result = py::array_t<float>(shape);
py::buffer_info br = result.request();

float *pr = static_cast<float *>(br.ptr); // pointer on result data
if (ptr1 == nullptr || ptr2 == nullptr || pr == nullptr) {
throw std::runtime_error("One vector is empty.");
}
vector_add(v1.size(), ptr1, ptr2, pr, cuda_device);
return result;
},
py::arg("v1"), py::arg("v2"), py::arg("cuda_device") = 0,
R"pbdoc(Computes the additions of two vectors
of the same size with CUDA.
:param v1: array
:param v2: array
:param cuda_device: device id (if mulitple one)
:return: addition of the two arrays
)pbdoc");

m.def(
"vector_sum0",
[](const py_array_float &vect, int max_threads, int cuda_device) -> float {
if (vect.size() == 0)
return 0;
auto ha = vect.request();
const float *ptr = reinterpret_cast<float *>(ha.ptr);
return vector_sum0(static_cast<unsigned int>(vect.size()), ptr, max_threads,
cuda_device);
},
py::arg("vect"), py::arg("max_threads") = 256, py::arg("cuda_device") = 0,
R"pbdoc(Computes the sum of all coefficients with CUDA. Naive method.
:param vect: array
:param max_threads: number of threads to use (it must be a power of 2)
:param cuda_device: device id (if mulitple one)
:return: sum
)pbdoc");

m.def(
"vector_sum_atomic",
[](const py_array_float &vect, int max_threads, int cuda_device) -> float {
if (vect.size() == 0)
return 0;
auto ha = vect.request();
const float *ptr = reinterpret_cast<float *>(ha.ptr);
return vector_sum_atomic(static_cast<unsigned int>(vect.size()), ptr, max_threads,
cuda_device);
},
py::arg("vect"), py::arg("max_threads") = 256, py::arg("cuda_device") = 0,
R"pbdoc(Computes the sum of all coefficients with CUDA. Uses atomicAdd
:param vect: array
:param max_threads: number of threads to use (it must be a power of 2)
:param cuda_device: device id (if mulitple one)
:return: sum
)pbdoc");

m.def(
"vector_sum6",
[](const py_array_float &vect, int max_threads, int cuda_device) -> float {
if (vect.size() == 0)
return 0;
auto ha = vect.request();
const float *ptr = reinterpret_cast<float *>(ha.ptr);
return vector_sum6(static_cast<unsigned int>(vect.size()), ptr, max_threads,
cuda_device);
},
py::arg("vect"), py::arg("max_threads") = 256, py::arg("cuda_device") = 0,
R"pbdoc(Computes the sum of all coefficients with CUDA. More efficient method.
:param vect: array
:param max_threads: number of threads to use (it must be a power of 2)
:param cuda_device: device id (if mulitple one)
:return: sum
)pbdoc");

py::enum_<FpemuMode>(m, "FpemuMode",
Expand Down Expand Up @@ -255,4 +160,5 @@ of the same size with CUDA.
:param cuda_device: device id (if mulitple one)
:return: forward pass
)pbdoc");

}

0 comments on commit f9baa46

Please sign in to comment.