Skip to content

Commit

Permalink
Implements a custom op calling a custom assembly (TreeEnsemble) (#124)
Browse files Browse the repository at this point in the history
* Implements a custom op calling a custom assembly (TreeEnsemble)

* ?

* small changes

* fix windows build

* fix compilation issue

* fix unit test

* skip if apple
  • Loading branch information
xadupre authored Nov 28, 2023
1 parent a90ecfe commit fa5e2a2
Show file tree
Hide file tree
Showing 10 changed files with 497 additions and 10 deletions.
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@
*.cmake
*.whl
*.def
*.ll
*.s
/*.png
/*.onnx
.build_path.txt
Expand Down
1 change: 1 addition & 0 deletions _cmake/targets/ortops_tutorial_cpu.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ ort_add_custom_op(
"CPU"
onnx_extended/ortops/tutorial/cpu
../onnx_extended/ortops/tutorial/cpu/custom_gemm.cc
../onnx_extended/ortops/tutorial/cpu/custom_tree_assembly.cc
../onnx_extended/ortops/tutorial/cpu/dynamic_quantize_linear.cc
../onnx_extended/ortops/tutorial/cpu/my_kernel.cc
../onnx_extended/ortops/tutorial/cpu/my_kernel_attr.cc
Expand Down
234 changes: 234 additions & 0 deletions _unittests/ut_ortops/test_tutorial_cpu_tree.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,234 @@
import multiprocessing
import os
import subprocess
import unittest
import warnings
from typing import Optional, Tuple
import numpy
from onnx import ModelProto
from onnx.helper import make_attribute
from sklearn.datasets import make_regression
from sklearn.ensemble import RandomForestRegressor
from onnx_extended.ext_test_case import ExtTestCase

try:
from onnxruntime import InferenceSession, SessionOptions
except ImportError:
SessionOptions, InferenceSession = None, None


def make_tree(n_features: int, n_trees: int, max_depth: int) -> ModelProto:
    """Train a random forest regressor on synthetic data and convert it to ONNX.

    The training set comes from ``make_regression`` with ``max_depth * 1024``
    samples and *n_features* features; both arrays are cast to float32
    before fitting.

    :param n_features: number of features of the synthetic dataset
    :param n_trees: number of estimators in the forest
    :param max_depth: maximum depth of the trees, it also scales the
        number of generated samples
    :return: the fitted forest converted with ``skl2onnx.to_onnx``
    """
    from skl2onnx import to_onnx

    n_samples = max_depth * 1024
    features, target = make_regression(n_samples, n_features)
    features = features.astype(numpy.float32)
    target = target.astype(numpy.float32)

    forest = RandomForestRegressor(n_estimators=n_trees, max_depth=max_depth)
    forest.fit(features, target)

    # Only the first row is handed to the converter as a sample input.
    return to_onnx(forest, features[:1])


def compile_tree(
    llc_exe: str,
    filename: str,
    onx: ModelProto,
    batch_size: int,
    tree_tile_size: int = 8,
    verbose: int = 0,
) -> str:
    """Compile an ONNX tree ensemble into a shared library.

    The function chains three tools:

    1. *treebeard* reads the ONNX model and dumps LLVM IR (``.ll``),
    2. *llc* turns the LLVM IR into assembly (``.s``),
    3. *clang* builds a shared library (``.so``) from the assembly.

    All generated files share the base name of *filename*. Attributes named
    ``nodes_hitrates`` are removed from every ``TreeEnsembleRegressor`` node
    of *onx* (the model is modified in place) before it is saved.

    :param llc_exe: path to the ``llc`` executable
    :param filename: file the ONNX model is written to
    :param onx: model to compile, modified in place
    :param batch_size: batch size given to treebeard, must be greater than 8
    :param tree_tile_size: tile size given to treebeard
    :param verbose: prints progress messages if not null
    :return: absolute path of the shared library
    :raises ValueError: if *batch_size* is not greater than 8
    :raises RuntimeError: if treebeard reports an error while dumping LLVM IR
    :raises FileNotFoundError: if the LLVM IR file was not produced
    :raises subprocess.CalledProcessError: if ``llc`` or ``clang`` exits
        with a non-zero status
    """
    # The original code asserted ``8 < batch_size`` right before setting the
    # pipeline width to 8; presumably the batch must be larger than the
    # pipeline width (to be confirmed against treebeard's documentation).
    # The check is done first so that an invalid value fails before any
    # import or file is touched, and it is not stripped by ``python -O``.
    pipeline_width = 8
    if batch_size <= pipeline_width:
        raise ValueError(
            f"batch_size must be greater than {pipeline_width}, got {batch_size}."
        )

    if verbose:
        print("[compile_tree] import treebeard")
    import treebeard

    if verbose:
        print(
            f"[compile_tree] treebeard set options, "
            f"batch_size={batch_size}, tree_tile_size={tree_tile_size}"
        )
    compiler_options = treebeard.CompilerOptions(batch_size, tree_tile_size)

    compiler_options.SetNumberOfCores(multiprocessing.cpu_count())
    compiler_options.SetMakeAllLeavesSameDepth(1)
    compiler_options.SetReorderTreesByDepth(True)
    compiler_options.SetPipelineWidth(pipeline_width)

    if verbose:
        print(f"[compile_tree] write filename={filename!r}")

    # let's remove nodes_hitrates to avoid a warning before saving the model
    for node in onx.graph.node:
        if node.op_type != "TreeEnsembleRegressor":
            continue
        positions = [
            i for i, att in enumerate(node.attribute) if att.name == "nodes_hitrates"
        ]
        # Delete from the end so that the remaining indices stay valid.
        for i in reversed(positions):
            del node.attribute[i]
    with open(filename, "wb") as f:
        f.write(onx.SerializeToString())

    onnx_model_path = os.path.abspath(filename)
    base_path = os.path.splitext(onnx_model_path)[0]
    llvm_file_path = f"{base_path}.ll"
    asm_file_path = f"{base_path}.s"
    so_file_path = f"{base_path}.so"

    if verbose:
        print(
            f"[compile_tree] treebeard context with onnx_model_path={onnx_model_path!r}"
        )
    tbContext = treebeard.TreebeardContext(onnx_model_path, "", compiler_options)
    tbContext.SetRepresentationType("sparse")
    tbContext.SetInputFiletype("onnx_file")

    if verbose:
        print(f"[compile_tree] LLVM dump into {llvm_file_path!r}")
    error = tbContext.DumpLLVMIR(llvm_file_path)
    if error:
        raise RuntimeError(
            f"Failed to dump LLVM IR in {llvm_file_path!r}, error={error}."
        )
    if not os.path.exists(llvm_file_path):
        raise FileNotFoundError(f"Unable to find {llvm_file_path!r}.")

    # Run LLC
    if verbose:
        print(f"[compile_tree] llc={llc_exe!r}")
        print(f"[compile_tree] run LLC into {asm_file_path!r}")
    # check=True: a failing llc must stop the pipeline instead of letting
    # clang run on a missing or stale assembly file.
    subprocess.run(
        [
            llc_exe,
            llvm_file_path,
            "-O3",
            "-march=x86-64",
            "-mcpu=native",
            "--relocation-model=pic",
            "-o",
            asm_file_path,
        ],
        check=True,
    )

    # Run CLANG
    if verbose:
        print(f"[compile_tree] run clang into {so_file_path!r}")
    # check=True: otherwise the function would return the path of a shared
    # library that was never built.
    subprocess.run(
        ["clang", "-shared", asm_file_path, "-fopenmp=libomp", "-o", so_file_path],
        check=True,
    )
    if verbose:
        print("[compile_tree] done.")
    return so_file_path


def make_ort_session(onx: ModelProto, assembly_name: Optional[str] = None) -> Tuple:
    """Build the onnxruntime sessions compared by the test.

    Up to three sessions are created, all on ``CPUExecutionProvider``:

    * a baseline running *onx* unchanged,
    * a session running a copy returned by ``change_onnx_operator_domain``
      where ``TreeEnsembleRegressor`` is moved to the domain
      ``onnx_extented.ortops.optim.cpu``,
    * if *assembly_name* is given, a session where every
      ``TreeEnsembleRegressor`` node of *onx* is replaced **in place** by a
      ``TreeEnsembleAssemblyRegressor`` node holding a single attribute
      ``assembly``.

    :param onx: model to run, modified in place when *assembly_name*
        is not None
    :param assembly_name: value stored in the ``assembly`` attribute,
        None to skip the third session
    :return: tuple ``(baseline, optimized, assembly)``, the last item is
        None if *assembly_name* is None
    """
    from onnxruntime import InferenceSession, SessionOptions
    from onnx_extended.ortops.tutorial.cpu import get_ort_ext_libs as lib_tuto
    from onnx_extended.ortops.optim.cpu import get_ort_ext_libs as lib_optim
    from onnx_extended.ortops.optim.optimize import (
        change_onnx_operator_domain,
        get_node_attribute,
    )

    # baseline
    baseline_bytes = onx.SerializeToString()
    sess_check = InferenceSession(baseline_bytes, providers=["CPUExecutionProvider"])

    # first optimization
    modes = get_node_attribute(onx.graph.node[0], "nodes_modes").strings
    optimized = change_onnx_operator_domain(
        onx,
        op_type="TreeEnsembleRegressor",
        op_domain="ai.onnx.ml",
        new_op_domain="onnx_extented.ortops.optim.cpu",
        nodes_modes=",".join(mode.decode("ascii") for mode in modes),
    )

    optim_libs = lib_optim()
    optim_options = SessionOptions()
    optim_options.register_custom_ops_library(optim_libs[0])
    sess_opt = InferenceSession(
        optimized.SerializeToString(),
        optim_options,
        providers=["CPUExecutionProvider"],
    )

    if assembly_name is None:
        return sess_check, sess_opt, None

    # assembly
    tutorial_domain = "onnx_extented.ortops.tutorial.cpu"
    for node in onx.graph.node:
        if node.op_type != "TreeEnsembleRegressor":
            continue
        node.op_type = "TreeEnsembleAssemblyRegressor"
        node.domain = tutorial_domain
        # The custom kernel only receives the assembly name.
        del node.attribute[:]
        node.attribute.append(make_attribute("assembly", assembly_name))

    opset = onx.opset_import.add()
    opset.domain = tutorial_domain
    opset.version = 1

    tuto_libs = lib_tuto()
    tuto_options = SessionOptions()
    tuto_options.register_custom_ops_library(tuto_libs[0])
    sess_assembly = InferenceSession(
        onx.SerializeToString(),
        tuto_options,
        providers=["CPUExecutionProvider"],
    )

    return sess_check, sess_opt, sess_assembly


class TestOrtOpTutorialCpuTree(ExtTestCase):
    """Checks the tutorial custom operators related to tree ensembles."""

    def test_get_ort_ext_libs(self):
        """The tutorial package exposes exactly one custom ops library."""
        from onnx_extended.ortops.tutorial.cpu import get_ort_ext_libs

        libs = get_ort_ext_libs()
        self.assertEqual(len(libs), 1)

    @unittest.skipIf(InferenceSession is None, "onnxruntime not installed")
    def test_custom_tree_ensemble(self):
        """Compares the outputs of every session built for the same forest.

        The assembly session is only built when the environment variable
        ``TEST_LLC_EXE`` points to an existing ``llc`` executable.
        """
        n_features = 5
        batch_size = 1024
        onx = make_tree(n_features=n_features, n_trees=100, max_depth=5)

        llc_exe = os.environ.get("TEST_LLC_EXE", "SKIP")
        if llc_exe == "SKIP":
            warnings.warn("Unable to find environment variable 'TEST_LLC_EXE'.")
            sessions = make_ort_session(onx)
        else:
            if not os.path.exists(llc_exe):
                raise FileNotFoundError(f"Unable to find {llc_exe}.")
            # Remove the outputs of a previous run before compiling again.
            leftovers = (
                "custom_tree_ensemble.onnx",
                "custom_tree_ensemble.ll",
                "custom_tree_ensemble.s",
                "custom_tree_ensemble.so",
            )
            for leftover in leftovers:
                if os.path.exists(leftover):
                    os.remove(leftover)
            run_as_script = __name__ == "__main__"
            assembly_name = compile_tree(
                llc_exe,
                "custom_tree_ensemble.onnx",
                onx,
                batch_size,
                verbose=1 if run_as_script else 0,
            )
            sessions = make_ort_session(onx, assembly_name)

        data = numpy.random.randn(batch_size, n_features).astype(numpy.float32)
        feeds = {"X": data}
        results = [sess.run(None, feeds)[0] for sess in sessions if sess is not None]

        expected = results[0]
        self.assertEqualArray(expected, results[1], atol=1e-3)
        if len(results) > 2:
            self.assertEqualArray(expected, results[2], atol=1e-3)


# Run the tests with verbose output when this file is executed as a script.
if __name__ == "__main__":
unittest.main(verbosity=2)
3 changes: 2 additions & 1 deletion _unittests/ut_validation/test_speed_metrics.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import unittest
from onnx_extended.ext_test_case import ExtTestCase
from onnx_extended.ext_test_case import ExtTestCase, skipif_ci_apple


class TestSpeedMetrics(ExtTestCase):
Expand All @@ -11,6 +11,7 @@ def test_benchmark_cache(self):
res = benchmark_cache(1000, False)
self.assertGreater(res, 0)

@skipif_ci_apple("unstable on Apple")
def test_benchmark_cache_tree(self):
from onnx_extended.validation.cpu._validation import (
benchmark_cache_tree,
Expand Down
10 changes: 5 additions & 5 deletions onnx_extended/cpp/include/common/c_op_math.h
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ namespace onnx_c_ops {
#define InlinedVector std::vector
#define InlinedHashSet std::unordered_set

#if defined(_WIN32) || defined(WIN32)
#if defined(_WIN32)

inline bool _isnan_(float x) { return _isnanf(x); }
inline bool _isnan_(double x) { return _isnan(x); }
Expand Down Expand Up @@ -166,8 +166,8 @@ std::size_t write_scores(std::vector<NTYPE> &scores,

template <typename NTYPE, typename T>
std::size_t write_scores(std::size_t n_classes, NTYPE *scores,
POST_EVAL_TRANSFORM post_transform, T *Z,
int add_second_class) {
POST_EVAL_TRANSFORM post_transform, T *Z,
int add_second_class) {
if (n_classes >= 2) {
NTYPE *end = scores + n_classes;
switch (post_transform) {
Expand Down Expand Up @@ -238,8 +238,8 @@ std::size_t write_scores(std::size_t n_classes, NTYPE *scores,
}

template <typename NTYPE, typename T>
std::size_t write_scores2(NTYPE *scores, POST_EVAL_TRANSFORM post_transform, T *Z,
int add_second_class) {
std::size_t write_scores2(NTYPE *scores, POST_EVAL_TRANSFORM post_transform,
T *Z, int add_second_class) {
switch (post_transform) {
case POST_EVAL_TRANSFORM::PROBIT:
Z[0] = ComputeProbit(scores[0]);
Expand Down
2 changes: 1 addition & 1 deletion onnx_extended/ortops/optim/cpu/ort_optim_cpu_lib.cc
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,11 @@
#include <mutex>
#include <vector>

#include "ortapi_version.h"
#include "ort_optim_cpu_lib.h"
#include "ort_svm.hpp"
#include "ort_tfidf_vectorizer.hpp"
#include "ort_tree_ensemble.hpp"
#include "ortapi_version.h"

static const char *c_OpDomain = "onnx_extented.ortops.optim.cpu";

Expand Down
Loading

0 comments on commit fa5e2a2

Please sign in to comment.