Add another implementation of vector sum (#17)

* better reduce * fix reduce operation * nvtx? * doc * add image * images * doc * add nvtx * fix nvtx * fix documentation * az
sdpython · Apr 21, 2023 · 33f366e · 33f366e
1 parent 36ab6cf
commit 33f366e
Show file tree

Hide file tree

Showing 20 changed files with 1,534 additions and 59 deletions.
diff --git a/README.rst b/README.rst
@@ -33,6 +33,9 @@ Documentation `onnx-extended
 Source are available on `github/onnx-extended
 <https://github.com/sdpython/onnx-extended>`_.
 
+Use C++ implementation of existing operators
+++++++++++++++++++++++++++++++++++++++++++++
+
 .. code-block:: python
 
     import timeit
@@ -89,3 +92,25 @@ Source are available on `github/onnx-extended
     difference: 0.0
     onnx: 0.024006774998269975
     onnx-extended: 0.0002316169993719086
+
+Build with CUDA, openmp
++++++++++++++++++++++++
+
+The package also contains some dummy examples showing how to
+build with C++ functions (`pybind11 <https://github.com/pybind/pybind11>`_,
+`cython <https://cython.org/>`_), with `openmp
+<https://www.openmp.org/>`_,
+with or without CUDA.
+The build will automatically link with CUDA if it is found.
+If not, some extensions might not be available.
+
+::
+
+    python setup.py build_ext --inplace
+
+`NVTX <https://github.com/NVIDIA/NVTX>`_
+can be enabled with the following command:
+
+::
+
+    python setup.py build_ext --inplace --enable_nvtx 1
diff --git a/_doc/_static/vector_sum6.png b/_doc/_static/vector_sum6.png
diff --git a/_doc/_static/vector_sum6_results.png b/_doc/_static/vector_sum6_results.png
diff --git a/_doc/api/reference.rst b/_doc/api/reference.rst
@@ -20,6 +20,10 @@ ai.onnx
 ai.onnx.ml
 ++++++++++
 
-.. autoclass:: onnx_extended.reference.c_ops.c_op_tree_ensemble_classifier.TreeEnsembleClassifier
+.. autoclass:: onnx_extended.reference.c_ops.c_op_tree_ensemble_classifier.TreeEnsembleClassifier_1
 
-.. autoclass:: onnx_extended.reference.c_ops.c_op_tree_ensemble_regressor.TreeEnsembleRegresspr
+.. autoclass:: onnx_extended.reference.c_ops.c_op_tree_ensemble_classifier.TreeEnsembleClassifier_3
+
+.. autoclass:: onnx_extended.reference.c_ops.c_op_tree_ensemble_regressor.TreeEnsembleRegressor_1
+
+.. autoclass:: onnx_extended.reference.c_ops.c_op_tree_ensemble_regressor.TreeEnsembleRegressor_3
diff --git a/_doc/api/validation.rst b/_doc/api/validation.rst
@@ -10,32 +10,34 @@ C API
 _validation
 +++++++++++
 
-.. autoclass:: onnx_extended.validation._validation.ElementTime
+.. autoclass:: onnx_extended.validation.cpu._validation.ElementTime
 
-.. autofunction:: onnx_extended.validation._validation.benchmark_cache
+.. autofunction:: onnx_extended.validation.cpu._validation.benchmark_cache
 
-.. autofunction:: onnx_extended.validation._validation.benchmark_cache_tree
+.. autofunction:: onnx_extended.validation.cpu._validation.benchmark_cache_tree
 
-.. autofunction:: onnx_extended.validation._validation.vector_add
+.. autofunction:: onnx_extended.validation.cpu._validation.vector_add
 
-.. autofunction:: onnx_extended.validation._validation.vector_sum
+.. autofunction:: onnx_extended.validation.cpu._validation.vector_sum
 
-.. autofunction:: onnx_extended.validation._validation.vector_sum_array
+.. autofunction:: onnx_extended.validation.cpu._validation.vector_sum_array
 
-.. autofunction:: onnx_extended.validation._validation.vector_sum_array_parallel
+.. autofunction:: onnx_extended.validation.cpu._validation.vector_sum_array_parallel
 
-.. autofunction:: onnx_extended.validation._validation.vector_sum_array_avx
+.. autofunction:: onnx_extended.validation.cpu._validation.vector_sum_array_avx
 
-.. autofunction:: onnx_extended.validation._validation.vector_sum_array_avx_parallel
+.. autofunction:: onnx_extended.validation.cpu._validation.vector_sum_array_avx_parallel
 
 cuda_example_py
 +++++++++++++++
 
-.. autofunction:: onnx_extended.cuda_example_py.vector_add
+.. autofunction:: onnx_extended.validation.cuda.cuda_example_py.vector_add
 
-.. autofunction:: onnx_extended.cuda_example_py.vector_sum
+.. autofunction:: onnx_extended.validation.cuda.cuda_example_py.vector_sum0
+
+.. autofunction:: onnx_extended.validation.cuda.cuda_example_py.vector_sum6
 
 vector_function_cy
 ++++++++++++++++++
 
-.. autofunction:: onnx_extended.vector_function_cy.vector_add_c
+.. autofunction:: onnx_extended.validation.cython.vector_function_cy.vector_add_c
diff --git a/_doc/examples/plot_bench_gpu_vector_sum_gpu.py b/_doc/examples/plot_bench_gpu_vector_sum_gpu.py
@@ -23,6 +23,7 @@
 try:
     from onnx_extended.validation.cuda.cuda_example_py import (
         vector_sum0,
+        vector_sum6,
         vector_sum_atomic,
     )
 except ImportError:
@@ -82,29 +83,43 @@
         )
     )
 
-    diff = abs(vector_sum0(values, 128) - dim**2)
-    res = measure_time(lambda: vector_sum0(values, 128), max_time=0.5)
+    diff = abs(vector_sum_atomic(values, 32) - dim**2)
+    res = measure_time(lambda: vector_sum_atomic(values, 32), max_time=0.5)
 
     obs.append(
         dict(
             dim=dim,
             size=values.size,
             time=res["average"],
-            direction="0cuda128",
+            direction="Acuda32",
             time_per_element=res["average"] / dim**2,
             diff=diff,
         )
     )
 
-    diff = abs(vector_sum_atomic(values, 32) - dim**2)
-    res = measure_time(lambda: vector_sum_atomic(values, 32), max_time=0.5)
+    diff = abs(vector_sum6(values, 32) - dim**2)
+    res = measure_time(lambda: vector_sum6(values, 32), max_time=0.5)
 
     obs.append(
         dict(
             dim=dim,
             size=values.size,
             time=res["average"],
-            direction="Acuda32",
+            direction="6cuda32",
+            time_per_element=res["average"] / dim**2,
+            diff=diff,
+        )
+    )
+
+    diff = abs(vector_sum6(values, 256) - dim**2)
+    res = measure_time(lambda: vector_sum6(values, 256), max_time=0.5)
+
+    obs.append(
+        dict(
+            dim=dim,
+            size=values.size,
+            time=res["average"],
+            direction="6cuda256",
             time_per_element=res["average"] / dim**2,
             diff=diff,
         )
@@ -126,7 +141,45 @@
 piv.plot(ax=ax[0], logx=True, title="Comparison between two summation")
 piv_diff.plot(ax=ax[1], logx=True, logy=True, title="Summation errors")
 piv_time.plot(ax=ax[2], logx=True, logy=True, title="Total time")
-fig.savefig("plot_bench_cpu_vector_sum_avx_parallel.png")
+fig.savefig("plot_bench_gpu_vector_sum_gpu.png")
 
 ##############################################
-# AVX is faster.
+# The results should look like the following.
+#
+# .. image:: ../_static/vector_sum6_results.png
+#
+# AVX is still faster. Let's try to understand why.
+#
+# Profiling
+# +++++++++
+#
+# The profiling indicates where the program spends most of its time.
+# It shows when the GPU is waiting and when the memory is copied
+# from host (CPU) to device (GPU) and the other way around. These are
+# the two steps we need to reduce or avoid to make use of the GPU.
+#
+# Profiling with `Nsight Systems <https://developer.nvidia.com/nsight-systems>`_:
+#
+# ::
+#
+#     nsys profile --trace=cuda,cudnn,cublas,osrt,nvtx,openmp python <file>
+#
+# If `nsys` fails to find `python`, the command `which python` should locate it.
+# `<file>` can be `plot_bench_gpu_vector_sum_gpu.py` for example.
+#
+# Then the command `nsys-ui` starts the visual interface of the profiler.
+# A screenshot shows the following after loading the profiling results.
+#
+# .. image:: ../_static/vector_sum6.png
+#
+# Most of the time is spent copying the data from CPU memory to GPU memory.
+# In our case, GPU is not really useful because just copying the data from CPU
+# to GPU takes more time than processing it with CPU and AVX instructions.
+#
+# GPU is useful for deep learning because many operations can be chained and
+# the data stays on GPU memory until the very end. When multiple tools are involved,
+# torch, numpy, onnxruntime, the `DLPack <https://github.com/dmlc/dlpack>`_
+# avoids copying the data when switching.
+#
+# The copy of a big tensor can happen block by block. The computation may start
+# before the data is fully copied.
diff --git a/_doc/tutorial/index.rst b/_doc/tutorial/index.rst
@@ -16,15 +16,16 @@ Operators
 .. toctree::
     :maxdepth: 1
 
-    ../autoexemples/plot_conv
+    ../auto_examples/plot_conv
 
 Validation, Experiments
 +++++++++++++++++++++++
 
 .. toctree::
     :maxdepth: 1
 
-    ../autoexemples/plot_bench_cpu
-    ../autoexemples/plot_bench_cpu_vector_sum
-    ../autoexemples/plot_bench_cpu_vector_sum_parallel
-    ../autoexemples/plot_bench_cpu_vector_sum_avx_parallel
+    ../auto_examples/plot_bench_cpu
+    ../auto_examples/plot_bench_cpu_vector_sum
+    ../auto_examples/plot_bench_cpu_vector_sum_parallel
+    ../auto_examples/plot_bench_cpu_vector_sum_avx_parallel
+    ../auto_examples/plot_bench_gpu_vector_sum_gpu
diff --git a/_unittests/ut_validation/test_vector_cuda.py b/_unittests/ut_validation/test_vector_cuda.py
@@ -8,11 +8,13 @@
         vector_sum0,
         vector_add,
         vector_sum_atomic,
+        vector_sum6,
     )
 else:
     vector_sum0 = None
     vector_add = None
     vector_sum_atomic = None
+    vector_sum6 = None
 
 
 class TestVectorCuda(ExtTestCase):
@@ -75,6 +77,18 @@ def test_vector_sum_atomic_cuda(self):
     def test_vector_sum_atomic_cud_bigger(self):
         values = numpy.random.randn(30, 224, 224).astype(numpy.float32)
         t = vector_sum_atomic(values)
+        self.assertAlmostEqual(t, values.sum().astype(numpy.float32), rtol=1e-3)
+
+    @unittest.skipIf(vector_sum6 is None, reason="CUDA not available")
+    def test_vector_sum6_cuda(self):
+        values = numpy.array([[10, 1, 4, 5, 6, 7]], dtype=numpy.float32)
+        t = vector_sum6(values)
+        self.assertEqual(t, values.sum().astype(numpy.float32))
+
+    @unittest.skipIf(vector_sum6 is None, reason="CUDA not available")
+    def test_vector_sum6_cud_bigger(self):
+        values = numpy.random.randn(30, 224, 224).astype(numpy.float32)
+        t = vector_sum6(values)
         self.assertAlmostEqual(t, values.sum().astype(numpy.float32), rtol=1e-4)
 
 

diff --git a/azure-pipelines.yml b/azure-pipelines.yml
@@ -34,7 +34,8 @@ jobs:
       black --diff .
     displayName: 'Black'
   - script: |
-      cmake-lint cmake/*  --disabled-codes C0103 C0113
+      cmake-lint cmake/Find*  --disabled-codes C0103 C0113
+      cmake-lint cmake/CMake*  --disabled-codes C0103 C0113
     displayName: 'cmake-lint'
   - script: |
       # python -m pip install -e .

diff --git a/clang_format.sh b/clang_format.sh
@@ -1,30 +1,6 @@
-clang-format --length 88 -i onnx_extended/reference/c_ops/c_op_common_parallel.hpp
-clang-format --length 88 -i onnx_extended/reference/c_ops/c_op_common.cpp
-clang-format --length 88 -i onnx_extended/reference/c_ops/c_op_common.h
+#!/bin/bash
 
-clang-format --length 88 -i onnx_extended/reference/c_ops/c_op_conv_.cpp
-clang-format --length 88 -i onnx_extended/reference/c_ops/c_op_conv_common.h
-clang-format --length 88 -i onnx_extended/reference/c_ops/c_op_conv.h
-
-clang-format --length 88 -i onnx_extended/reference/c_ops/c_op_tree_ensemble_common_.hpp
-clang-format --length 88 -i onnx_extended/reference/c_ops/c_op_tree_ensemble_common_agg_.hpp
-clang-format --length 88 -i onnx_extended/reference/c_ops/c_op_tree_ensemble_common_classifier_.hpp
-clang-format --length 88 -i onnx_extended/reference/c_ops/c_op_tree_ensemble_py_.cpp
-clang-format --length 88 -i onnx_extended/reference/c_ops/c_op_tree_ensemble_py_classifier_.hpp
-
-clang-format --length 88 -i onnx_extended/validation/vector_function.h
-clang-format --length 88 -i onnx_extended/validation/vector_function.cpp
-clang-format --length 88 -i onnx_extended/validation/vector_sum.h
-clang-format --length 88 -i onnx_extended/validation/vector_sum.cpp
-
-clang-format --length 88 -i onnx_extended/validation/speed_metrics.cpp
-clang-format --length 88 -i onnx_extended/validation/speed_metrics.h
-
-clang-format --length 88 -i onnx_extended/validation/_validation.cpp
-
-clang-format --length 88 -i onnx_extended/validation/cuda_utils.h
-clang-format --length 88 -i onnx_extended/validation/cuda_example.h
-clang-format --length 88 -i onnx_extended/validation/cuda_example.cpp
-clang-format --length 88 -i onnx_extended/validation/cuda_example.cu
-clang-format --length 88 -i onnx_extended/validation/cuda_example.cuh
-clang-format --length 88 -i onnx_extended/validation/cuda_example_py.cpp
+find onnx_extended -type f \( -name "*.h" -o -name "*.hpp" -o -name "*.cuh" -o -name "*.cpp" -o -name "*.cc" -o -name "*.cu" \) | while read f; do
+    echo "Processing '$f'";
+    clang-format --length 88 -i $f;
+done
diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt
@@ -26,6 +26,7 @@ message(STATUS "PYTHON_LIBRARY_DIR=${PYTHON_LIBRARY_DIR}")
 message(STATUS "PYTHON_NUMPY_INCLUDE_DIR=${PYTHON_NUMPY_INCLUDE_DIR}")
 message(STATUS "PYTHON_MODULE_EXTENSION=${PYTHON_MODULE_EXTENSION}")
 message(STATUS "PYTHON_NUMPY_VERSION=${PYTHON_NUMPY_VERSION}")
+message(STATUS "USE_NVTX=${USE_NVTX}")
 message(STATUS "ENV-PATH=$ENV{PATH}")
 message(STATUS "ENV-PYTHONPATH=$ENV{PYTHONPATH}")
 
@@ -124,6 +125,8 @@ if(CUDA_FOUND)
   message(STATUS "CUDA_cusparse_LIBRARY=${CUDA_cusparse_LIBRARY}")
   message(STATUS "CUDA_nvToolsExt_LIBRARY=${CUDA_nvToolsExt_LIBRARY}")
   message(STATUS "CUDA_OpenCL_LIBRARY=${CUDA_OpenCL_LIBRARY}")
+  message(STATUS "CUDA NVTX_LINK_C=${NVTX_LINK_C}")
+  message(STATUS "CUDA NVTX_LINK_CPP=${NVTX_LINK_CPP}")
   set(CUDA_AVAILABLE 1)
 else()
   message(STATUS "Module CudaExtension is not installed.")
@@ -182,7 +185,8 @@ if(CUDA_AVAILABLE)
     cuda_example_py
     ../onnx_extended/validation/cuda/cuda_example_py.cpp
     ../onnx_extended/validation/cuda/cuda_example.cpp
-    ../onnx_extended/validation/cuda/cuda_example.cu)
+    ../onnx_extended/validation/cuda/cuda_example.cu
+    ../onnx_extended/validation/cuda/cuda_example_reduce.cu)
 
 else()
   set(config_content "HAS_CUDA = 0")