Skip to content

Commit

Permalink
add aot custom op to accelerate computing bbox_iou on GPU
Browse files Browse the repository at this point in the history
  • Loading branch information
panshaowu committed Aug 25, 2023
1 parent d459bea commit 3abaf62
Show file tree
Hide file tree
Showing 22 changed files with 1,868 additions and 44 deletions.
6 changes: 6 additions & 0 deletions docs/en/installation.md
Original file line number Diff line number Diff line change
Expand Up @@ -63,3 +63,9 @@ In addition, we provide an optional [fast coco api](https://github.com/facebookr
cd mindyolo/csrc
sh build.sh
```

We also provide fused GPU operators built on the MindSpore [ops.Custom](https://www.mindspore.cn/tutorials/experts/en/master/operation/op_custom.html) API. These fused operators can improve training speed. The source code is written in C++ and CUDA and is located in the folder `mindyolo/models/losses/fused_op`. You can compile the source code into dynamic link libraries with the following command **(this operation is optional)**:

```shell
bash mindyolo/models/losses/fused_op/build.sh
```
8 changes: 7 additions & 1 deletion docs/zh/installation.md
Original file line number Diff line number Diff line change
Expand Up @@ -56,9 +56,15 @@ cd mindyolo
pip install -e .
```

另外, 我们提供了一个可选的 [fast coco api](https://github.com/facebookresearch/detectron2/blob/main/detectron2/evaluation/fast_eval_api.py) 接口用于提升验证过程的速度。代码是以C++形式提供的,可以尝试用以下的命令进行安装 **(此操作是可选的)** :
我们提供了一个可选的 [fast coco api](https://github.com/facebookresearch/detectron2/blob/main/detectron2/evaluation/fast_eval_api.py) 接口用于提升验证过程的速度。代码是以C++形式提供的,可以尝试用以下的命令进行安装 **(此操作是可选的)** :

```shell
cd mindyolo/csrc
sh build.sh
```

我们还提供了基于MindSpore [Custom自定义算子](https://www.mindspore.cn/tutorials/experts/zh-CN/master/operation/op_custom.html) 的GPU融合算子,用于提升训练过程的速度。代码采用C++和CUDA开发,位于`mindyolo/models/losses/fused_op`路径下。可以使用以下的命令,编译生成GPU融合算子运行所依赖的动态库,用于调测 **(此操作是可选的)** :

```shell
bash mindyolo/models/losses/fused_op/build.sh
```
272 changes: 272 additions & 0 deletions mindyolo/models/losses/fused_op/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,272 @@
import os
import subprocess

from mindspore.ops import DataType, CustomRegOp


# Base names of the CUDA kernels backing the fused operators; each one has a
# `<name>.cu` source and is compiled to a `<name>.so` library in this directory.
fused_op_list = [
    'fused_get_ciou_kernel',
    'fused_get_center_dist_kernel',
    'fused_get_convex_diagonal_squared_kernel',
    'fused_get_ciou_diagonal_angle_kernel',
    'fused_get_boundding_boxes_coord_kernel',
    'fused_get_intersection_area_kernel',
]

# Use an absolute directory so the library paths stay valid even when
# __file__ is relative (an empty dirname would otherwise yield "/<name>.so").
fused_ops_dir = os.path.dirname(os.path.abspath(__file__))

# Best-effort build at import time: compile every kernel whose shared library
# is missing. A failed build is reported but never raised, so importing this
# package keeps working on machines without nvcc.
for fused_op_item in fused_op_list:
    so_path = fused_ops_dir + '/' + fused_op_item + '.so'
    if os.path.exists(so_path):
        continue
    cu_path = fused_ops_dir + '/' + fused_op_item + '.cu'
    # Human-readable form of the command, kept for logging only.
    nvcc_cmd = 'nvcc --shared -Xcompiler -fPIC -o ' + so_path + ' ' + cu_path
    print("nvcc compiler cmd: {}".format(nvcc_cmd))
    # Pass the arguments as a list (no shell) so paths containing spaces or
    # shell metacharacters are handed to nvcc unchanged.
    nvcc_args = ['nvcc', '--shared', '-Xcompiler', '-fPIC', '-o', so_path, cu_path]
    try:
        nvcc_status = subprocess.run(nvcc_args, check=False).returncode
    except OSError as nvcc_error:
        # Typically raised when the nvcc executable cannot be found or run.
        print("nvcc could not be executed for {}: {}".format(fused_op_item, nvcc_error))
        continue
    if nvcc_status != 0:
        print("nvcc failed for {} with exit code {}".format(fused_op_item, nvcc_status))

def _aot_entry(kernel_name, symbol):
    """Return "<fused_ops_dir>/<kernel_name>.so:<symbol>" for one compiled kernel."""
    return "{}/{}.so:{}".format(fused_ops_dir, kernel_name, symbol)


# Library-path/function-name strings, one forward and one backward entry per
# kernel library. The symbol suffix differs between kernels (Bprop vs Grad).
fused_get_ciou_op_path = _aot_entry("fused_get_ciou_kernel", "FusedGetCiou")
fused_get_ciou_op_bprop_path = _aot_entry("fused_get_ciou_kernel", "FusedGetCiouBprop")

fused_get_center_dist_op_path = _aot_entry("fused_get_center_dist_kernel", "FusedGetCenterDist")
fused_get_center_dist_op_bprop_path = _aot_entry("fused_get_center_dist_kernel", "FusedGetCenterDistBprop")

fused_get_convex_diagonal_squared_path = _aot_entry(
    "fused_get_convex_diagonal_squared_kernel", "FusedGetConvexDiagonalSquared")
fused_get_convex_diagonal_squared_grad_path = _aot_entry(
    "fused_get_convex_diagonal_squared_kernel", "FusedGetConvexDiagonalSquaredGrad")

fused_get_ciou_diagonal_angle_path = _aot_entry(
    "fused_get_ciou_diagonal_angle_kernel", "FusedGetCiouDiagonalAngle")
fused_get_ciou_diagonal_angle_grad_path = _aot_entry(
    "fused_get_ciou_diagonal_angle_kernel", "FusedGetCiouDiagonalAngleGrad")

fused_get_boundding_boxes_coord_path = _aot_entry(
    "fused_get_boundding_boxes_coord_kernel", "FusedGetBounddingBoxesCoord")
fused_get_boundding_boxes_coord_grad_path = _aot_entry(
    "fused_get_boundding_boxes_coord_kernel", "FusedGetBounddingBoxesCoordGrad")

fused_get_intersection_area_path = _aot_entry(
    "fused_get_intersection_area_kernel", "FusedGetIntersectionArea")
fused_get_intersection_area_grad_path = _aot_entry(
    "fused_get_intersection_area_kernel", "FusedGetIntersectionAreaGrad")


# GPU registration info with 4 inputs (v, iou, rho2, c2) and 2 outputs
# (alpha, out); every tensor is declared as DataType.F32_Default.
fuse_get_ciou_gpu_info = (
    CustomRegOp()
    .input(0, "v")
    .input(1, "iou")
    .input(2, "rho2")
    .input(3, "c2")
    .output(0, "alpha")
    .output(1, "out")
    # One dtype entry per tensor: 4 inputs followed by 2 outputs.
    .dtype_format(*([DataType.F32_Default] * 6))
    .target("GPU")
    .get_op_info()
)


# GPU registration info with 6 inputs (the four forward inputs plus d_alpha
# and d_out) and 4 outputs (d_v, d_iou, d_rho2, d_c2); every tensor is
# declared as DataType.F32_Default.
fuse_get_ciou_bprop_gpu_info = (
    CustomRegOp()
    .input(0, "v")
    .input(1, "iou")
    .input(2, "rho2")
    .input(3, "c2")
    .input(4, "d_alpha")
    .input(5, "d_out")
    .output(0, "d_v")
    .output(1, "d_iou")
    .output(2, "d_rho2")
    .output(3, "d_c2")
    # One dtype entry per tensor: 6 inputs followed by 4 outputs.
    .dtype_format(*([DataType.F32_Default] * 10))
    .target("GPU")
    .get_op_info()
)


# GPU registration info with 8 box-coordinate inputs and a single output
# ("out"); every tensor is declared as DataType.F32_Default.
fuse_get_center_dist_gpu_info = (
    CustomRegOp()
    .input(0, "b1_x1")
    .input(1, "b1_x2")
    .input(2, "b1_y1")
    .input(3, "b1_y2")
    .input(4, "b2_x1")
    .input(5, "b2_x2")
    .input(6, "b2_y1")
    .input(7, "b2_y2")
    .output(0, "out")
    # One dtype entry per tensor: 8 inputs followed by 1 output.
    .dtype_format(*([DataType.F32_Default] * 9))
    .target("GPU")
    .get_op_info()
)


# GPU registration info with 9 inputs (the eight box coordinates plus d_out)
# and 8 outputs (one d_* entry per coordinate); every tensor is declared as
# DataType.F32_Default.
fuse_get_center_dist_bprop_gpu_info = (
    CustomRegOp()
    .input(0, "b1_x1")
    .input(1, "b1_x2")
    .input(2, "b1_y1")
    .input(3, "b1_y2")
    .input(4, "b2_x1")
    .input(5, "b2_x2")
    .input(6, "b2_y1")
    .input(7, "b2_y2")
    .input(8, "d_out")
    .output(0, "d_b1_x1")
    .output(1, "d_b1_x2")
    .output(2, "d_b1_y1")
    .output(3, "d_b1_y2")
    .output(4, "d_b2_x1")
    .output(5, "d_b2_x2")
    .output(6, "d_b2_y1")
    .output(7, "d_b2_y2")
    # One dtype entry per tensor: 9 inputs followed by 8 outputs.
    .dtype_format(*([DataType.F32_Default] * 17))
    .target("GPU")
    .get_op_info()
)


# GPU registration info with 8 box-coordinate inputs and a single output
# ("out"); every tensor is declared as DataType.F32_Default.
fused_get_convex_diagonal_squared_info = (
    CustomRegOp()
    .input(0, "b1_x1")
    .input(1, "b1_x2")
    .input(2, "b2_x1")
    .input(3, "b2_x2")
    .input(4, "b1_y1")
    .input(5, "b1_y2")
    .input(6, "b2_y1")
    .input(7, "b2_y2")
    # Output indices are counted on their own, starting from 0; they do not
    # continue the input numbering.
    .output(0, "out")
    # One dtype entry per tensor: 8 inputs followed by 1 output.
    .dtype_format(*([DataType.F32_Default] * 9))
    .target("GPU")
    .get_op_info()
)


# GPU registration info with 9 inputs (the eight box coordinates plus dout)
# and 8 outputs (one d_* entry per coordinate); every tensor is declared as
# DataType.F32_Default.
fused_get_convex_diagonal_squared_grad_info = (
    CustomRegOp()
    .input(0, "b1_x1")
    .input(1, "b1_x2")
    .input(2, "b2_x1")
    .input(3, "b2_x2")
    .input(4, "b1_y1")
    .input(5, "b1_y2")
    .input(6, "b2_y1")
    .input(7, "b2_y2")
    .input(8, "dout")
    # Output indices are counted on their own, starting from 0; they do not
    # continue the input numbering.
    .output(0, "d_b1_x1")
    .output(1, "d_b1_x2")
    .output(2, "d_b2_x1")
    .output(3, "d_b2_x2")
    .output(4, "d_b1_y1")
    .output(5, "d_b1_y2")
    .output(6, "d_b2_y1")
    .output(7, "d_b2_y2")
    # One dtype entry per tensor: 9 inputs followed by 8 outputs.
    .dtype_format(*([DataType.F32_Default] * 17))
    .target("GPU")
    .get_op_info()
)


# GPU registration info with 4 inputs (w1, h1, w2, h2) and a single output
# ("out"); every tensor is declared as DataType.F32_Default.
fused_get_ciou_diagonal_angle_info = (
    CustomRegOp()
    .input(0, "w1")
    .input(1, "h1")
    .input(2, "w2")
    .input(3, "h2")
    # Output indices are counted on their own, starting from 0; they do not
    # continue the input numbering.
    .output(0, "out")
    # One dtype entry per tensor: 4 inputs followed by 1 output.
    .dtype_format(*([DataType.F32_Default] * 5))
    .target("GPU")
    .get_op_info()
)


# GPU registration info with 5 inputs (w1, h1, w2, h2 and a fifth tensor
# named "out") and 4 outputs (w1_diff, h1_diff, w2_diff, h2_diff); every
# tensor is declared as DataType.F32_Default.
fused_get_ciou_diagonal_angle_grad_info = (
    CustomRegOp()
    .input(0, "w1")
    .input(1, "h1")
    .input(2, "w2")
    .input(3, "h2")
    .input(4, "out")
    # Output indices are counted on their own, starting from 0; they do not
    # continue the input numbering.
    .output(0, "w1_diff")
    .output(1, "h1_diff")
    .output(2, "w2_diff")
    .output(3, "h2_diff")
    # One dtype entry per tensor: 5 inputs followed by 4 outputs.
    .dtype_format(*([DataType.F32_Default] * 9))
    .target("GPU")
    .get_op_info()
)


# GPU registration info with 8 inputs (x, y, w, h for two boxes) and 8
# outputs (x1, y1, x2, y2 for two boxes); every tensor is declared as
# DataType.F32_Default.
fused_get_boundding_boxes_coord_gpu_info = (
    CustomRegOp()
    .input(0, "x1")
    .input(1, "y1")
    .input(2, "w1")
    .input(3, "h1")
    .input(4, "x2")
    .input(5, "y2")
    .input(6, "w2")
    .input(7, "h2")
    .output(0, "b1_x1")
    .output(1, "b1_y1")
    .output(2, "b1_x2")
    .output(3, "b1_y2")
    .output(4, "b2_x1")
    .output(5, "b2_y1")
    .output(6, "b2_x2")
    .output(7, "b2_y2")
    # One dtype entry per tensor: 8 inputs followed by 8 outputs.
    .dtype_format(*([DataType.F32_Default] * 16))
    .target("GPU")
    .get_op_info()
)


# GPU registration info with 8 gradient inputs (d_b*) and 8 gradient outputs
# (d_x, d_y, d_w, d_h for two boxes); every tensor is declared as
# DataType.F32_Default.
# NOTE(review): the input names are ordered x1, x2, y1, y2 per box, whereas
# fused_get_boundding_boxes_coord_gpu_info names its outputs x1, y1, x2, y2 —
# confirm the intended order against the CUDA kernel.
fused_get_boundding_boxes_coord_bprop_gpu_info = (
    CustomRegOp()
    .input(0, "d_b1_x1")
    .input(1, "d_b1_x2")
    .input(2, "d_b1_y1")
    .input(3, "d_b1_y2")
    .input(4, "d_b2_x1")
    .input(5, "d_b2_x2")
    .input(6, "d_b2_y1")
    .input(7, "d_b2_y2")
    .output(0, "d_x1")
    .output(1, "d_y1")
    .output(2, "d_w1")
    .output(3, "d_h1")
    .output(4, "d_x2")
    .output(5, "d_y2")
    .output(6, "d_w2")
    .output(7, "d_h2")
    # One dtype entry per tensor: 8 inputs followed by 8 outputs.
    .dtype_format(*([DataType.F32_Default] * 16))
    .target("GPU")
    .get_op_info()
)


# GPU registration info with 8 box-coordinate inputs and a single output
# ("inter"); every tensor is declared as DataType.F32_Default.
fused_get_intersection_area_gpu_info = (
    CustomRegOp()
    .input(0, "b1_x1")
    .input(1, "b1_x2")
    .input(2, "b2_x1")
    .input(3, "b2_x2")
    .input(4, "b1_y1")
    .input(5, "b1_y2")
    .input(6, "b2_y1")
    .input(7, "b2_y2")
    # Output indices are counted on their own, starting from 0; they do not
    # continue the input numbering.
    .output(0, "inter")
    # One dtype entry per tensor: 8 inputs followed by 1 output.
    .dtype_format(*([DataType.F32_Default] * 9))
    .target("GPU")
    .get_op_info()
)


# GPU registration info with 9 inputs (the eight box coordinates plus
# d_inter) and 8 outputs (one d_* entry per coordinate); every tensor is
# declared as DataType.F32_Default.
fused_get_intersection_area_gpu_grad_info = (
    CustomRegOp()
    .input(0, "b1_x1")
    .input(1, "b1_x2")
    .input(2, "b2_x1")
    .input(3, "b2_x2")
    .input(4, "b1_y1")
    .input(5, "b1_y2")
    .input(6, "b2_y1")
    .input(7, "b2_y2")
    .input(8, "d_inter")
    # Output indices are counted on their own, starting from 0; they do not
    # continue the input numbering.
    .output(0, "d_b1_x1")
    .output(1, "d_b1_x2")
    .output(2, "d_b2_x1")
    .output(3, "d_b2_x2")
    .output(4, "d_b1_y1")
    .output(5, "d_b1_y2")
    .output(6, "d_b2_y1")
    .output(7, "d_b2_y2")
    # One dtype entry per tensor: 9 inputs followed by 8 outputs.
    .dtype_format(*([DataType.F32_Default] * 17))
    .target("GPU")
    .get_op_info()
)

7 changes: 7 additions & 0 deletions mindyolo/models/losses/fused_op/build.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
#!/usr/bin/env bash
# Compile every fused CUDA kernel that lives next to this script into a shared
# library (<kernel>.so written beside <kernel>.cu).
# All kernels are attempted even if one fails; the script's exit status is
# non-zero when at least one build failed.

# Quote the directory so checkouts under paths containing spaces still work.
script_dir="$(dirname "$0")"
build_status=0

for kernel in \
    fused_get_intersection_area_kernel \
    fused_get_ciou_kernel \
    fused_get_ciou_diagonal_angle_kernel \
    fused_get_center_dist_kernel \
    fused_get_boundding_boxes_coord_kernel \
    fused_get_iou_kernel \
    fused_get_convex_diagonal_squared_kernel
do
    if ! nvcc --shared -Xcompiler -fPIC -o "${script_dir}/${kernel}.so" "${script_dir}/${kernel}.cu"; then
        echo "build.sh: failed to compile ${kernel}.cu" >&2
        build_status=1
    fi
done

# Final command sets the exit status without calling `exit`, so sourcing the
# script does not terminate the caller's shell.
[ "${build_status}" -eq 0 ]
Loading

0 comments on commit 3abaf62

Please sign in to comment.