diff --git a/examples/onnx-yolo/post-process.py b/examples/onnx-yolo/post-process.py
index c7e1c0e..e05ee15 100644
--- a/examples/onnx-yolo/post-process.py
+++ b/examples/onnx-yolo/post-process.py
@@ -4,17 +4,21 @@ def hook(frame_data, _):
     frame = frame_data['original']
     model_output = frame_data['inference_output']
 
-    yolo_input_shape = (640, 640, 3) # h,w,c
-    boxes, scores, class_ids = parse_yolo_output(model_output, frame.shape, yolo_input_shape)
-    class_labels = [yolo_classes[id] for id in class_ids]
-    for i in range(len(boxes)):
-        draw_bbox(frame, boxes[i], class_labels[i], scores[i])
+    if len(model_output) > 0:
+        yolo_input_shape = (640, 640, 3) # h,w,c
+        boxes, scores, class_ids = postprocess_yolo(frame.shape, yolo_input_shape, model_output)
+        class_labels = [yolo_classes[id] for id in class_ids]
+        for i in range(len(boxes)):
+            draw_bbox(frame, boxes[i], class_labels[i], scores[i], color_palette[class_ids[i]])
 
-    frame_data['modified'] = frame
+        frame_data['modified'] = frame
 
 #################################################
 # Util functions to make the hook more readable #
 #################################################
+confidence_thres = 0.45
+iou_thres = 0.5
+
 yolo_classes = ['person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train', 'truck', 'boat',
     'traffic light', 'fire hydrant', 'stop sign', 'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse',
     'sheep', 'cow', 'elephant', 'bear', 'zebra', 'giraffe', 'backpack', 'umbrella', 'handbag', 'tie', 'suitcase', 'frisbee',
@@ -24,47 +28,19 @@ def hook(frame_data, _):
     'potted plant', 'bed', 'dining table', 'toilet', 'tv', 'laptop', 'mouse', 'remote', 'keyboard', 'cell phone',
     'microwave', 'oven', 'toaster', 'sink', 'refrigerator', 'book', 'clock', 'vase', 'scissors', 'teddy bear',
     'hair drier', 'toothbrush']
+color_palette = np.random.uniform(0, 255, size=(len(yolo_classes), 3))
 
 def xywh2xyxy(i):
+    """
+    Converts from (center-x, center-y, w, h) to (x1, y1, x2, y2)
+    """
     o = i.view() # Create numpy view
-    dw = i[..., 2] / 2
-    dh = i[..., 3] / 2
-    o[..., 0] = i[..., 0] - dw
-    o[..., 1] = i[..., 1] - dh
-    o[..., 2] = i[..., 0] + dw
-    o[..., 3] = i[..., 1] + dh
+    o[..., 0] = i[..., 0] - i[..., 2] / 2
+    o[..., 1] = i[..., 1] - i[..., 3] / 2
+    o[..., 2] = i[..., 0] + i[..., 2]
+    o[..., 3] = i[..., 1] + i[..., 3]
     return o
 
-def rescale_boxes(original_image_shape, model_input_shape, boxes):
-    img_height, img_width, _ = original_image_shape
-    input_height, input_width, _ = model_input_shape
-    input_shape = np.array([input_width, input_height, input_width, input_height])
-    boxes = np.divide(boxes, input_shape, dtype=np.float32)
-    boxes *= np.array([img_width, img_height, img_width, img_height])
-    return boxes
-
-def parse_yolo_output(model_output, orginal_image_shape, model_input_shape):
-    confidence_threshold = 0.3
-    iou_threshold = 0.7
-
-    predictions = np.squeeze(model_output[0]).T
-
-    scores = np.max(predictions[:, 4:], axis=1)
-    predictions = predictions[scores > confidence_threshold, :]
-    scores = scores[scores > confidence_threshold]
-    if len(scores) == 0:
-        return [], [], []
-
-    class_ids = np.argmax(predictions[:, 4:], axis=1)
-
-    # Extract boxes
-    boxes = predictions[:, :4]
-    boxes = rescale_boxes(orginal_image_shape, model_input_shape, boxes)
-    boxes = xywh2xyxy(boxes)
-
-    indices = cv2.dnn.NMSBoxes(boxes, scores, confidence_threshold, iou_threshold)
-    return boxes[indices], scores[indices], class_ids[indices]
-
 def clip_boxes(boxes, shape):
     boxes[..., [0, 2]] = boxes[..., [0, 2]].clip(0, shape[1]) # x1, x2
     boxes[..., [1, 3]] = boxes[..., [1, 3]].clip(0, shape[0]) # y1, y2
@@ -85,3 +61,72 @@ def draw_bbox(image, box, label='', score=None, color=(255, 0, 255), txt_color=(
     else:
         cv2.putText(image, label, (p1[0], p1[1] - 2 if outside else p1[1] + h + 2), 0, lw / 3,
                     txt_color, thickness=tf, lineType=cv2.LINE_AA)
+
+def postprocess_yolo(original_frame_shape, resized_img_shape, output):
+    original_height, original_width, _ = original_frame_shape
+    resized_height, resized_width, _ = resized_img_shape
+
+    outputs = np.transpose(np.squeeze(output[0]))
+
+    # Get the number of rows in the outputs array
+    rows = outputs.shape[0]
+
+    boxes = []
+    scores = []
+    class_ids = []
+
+    # Calculate the scaling factor for the bounding box coordinates
+    if original_height > original_width:
+        scale_factor = original_height / resized_height
+    else:
+        scale_factor = original_width / resized_width
+
+    # Iterate over each row in the outputs array
+    for i in range(rows):
+        classes_scores = outputs[i][4:]
+
+        # FIXME: For some reason, when running YOLO via ONNX it sometimes returns NaN values in the
+        # class scores, and other times it returns 1 for some classes and 0 for the rest, which is
+        # almost certainly a bad prediction. This hack skips those entries.
+        nan_mask = np.isnan(classes_scores)
+        if np.any(nan_mask):
+            continue
+        if np.any(classes_scores == 1):
+            continue
+
+        max_score = np.amax(classes_scores)
+        if max_score >= confidence_thres:
+            class_id = np.argmax(classes_scores) # Get the class ID with the highest score
+            x, y, w, h = outputs[i][0], outputs[i][1], outputs[i][2], outputs[i][3]
+
+            # Calculate the scaled coordinates of the bounding box;
+            # the original image was padded to be square
+            if original_height > original_width:
+                # We added padding on the width
+                pad = (resized_width - original_width / scale_factor) // 2
+                left = int((x - pad) * scale_factor)
+                top = int(y * scale_factor)
+            else:
+                # We added padding on the height
+                pad = (resized_height - original_height / scale_factor) // 2
+                left = int(x * scale_factor)
+                top = int((y - pad) * scale_factor)
+            width = int(w * scale_factor)
+            height = int(h * scale_factor)
+
+            class_ids.append(class_id)
+            scores.append(max_score)
+            boxes.append([left, top, width, height])
+
+    if len(boxes) > 0:
+        boxes = np.array(boxes)
+        scores = np.array(scores)
+        class_ids = np.array(class_ids)
+
+        clip_boxes(boxes, original_frame_shape)
+        boxes = xywh2xyxy(boxes)
+        indices = cv2.dnn.NMSBoxes(boxes, scores, confidence_thres, iou_thres)
+
+        return boxes[indices], scores[indices], class_ids[indices]
+    else:
+        return [], [], []
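A note on the rewritten `xywh2xyxy`: it is only correct because `o = i.view()` shares memory with `i`. The assignment to `o[..., 0]` also overwrites `i[..., 0]` with x1, so the later `i[..., 0] + i[..., 2]` evaluates to x1 + w = x2 rather than cx + w. A minimal standalone check (the sample box values are made up for illustration):

```python
import numpy as np

def xywh2xyxy(i):
    o = i.view()                           # o aliases i, so every write below is visible through i
    o[..., 0] = i[..., 0] - i[..., 2] / 2  # i[..., 0] now holds x1
    o[..., 1] = i[..., 1] - i[..., 3] / 2  # i[..., 1] now holds y1
    o[..., 2] = i[..., 0] + i[..., 2]      # reads the updated x1, so x1 + w == x2
    o[..., 3] = i[..., 1] + i[..., 3]      # reads the updated y1, so y1 + h == y2
    return o

box = np.array([[100.0, 100.0, 40.0, 20.0]])  # cx, cy, w, h
print(xywh2xyxy(box))                          # [[ 80.  90. 120. 110.]]
```

If the function ever copied its input instead (e.g. `i.copy()`), the last two assignments would silently produce cx + w rather than x2.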
diff --git a/examples/onnx-yolo/pre-process.py b/examples/onnx-yolo/pre-process.py
index 6fdd41d..1e2fb0b 100644
--- a/examples/onnx-yolo/pre-process.py
+++ b/examples/onnx-yolo/pre-process.py
@@ -1,34 +1,50 @@
 import cv2
 import numpy as np
 
-def resize_rgb_frame(frame, target_dim):
-    target_height = target_dim[0]
-    target_width = target_dim[1]
-    channels = target_dim[2]
-    # Scale the image maintaining aspect ratio
-    width_ratio = target_width / frame.shape[1]
-    height_ratio = target_height / frame.shape[0]
+def is_cuda_available():
+    return cv2.cuda.getCudaEnabledDeviceCount() > 0
+
+"""
+Resize and pad image. Uses CUDA when available.
+"""
+def resize_and_pad(frame, target_dim, pad_top, pad_bottom, pad_left, pad_right):
+    target_height, target_width = target_dim
+    if is_cuda_available():
+        # FIXME: due to the memory allocation here, this can be even slower than running on CPU.
+        # We must provide the frame to the hook from GPU memory.
+        frame_gpu = cv2.cuda_GpuMat(frame)
+        resized_frame_gpu = cv2.cuda.resize(frame_gpu, (target_width, target_height), interpolation=cv2.INTER_CUBIC)
+        padded_frame_gpu = cv2.cuda.copyMakeBorder(resized_frame_gpu, pad_top, pad_bottom, pad_left, pad_right, cv2.BORDER_CONSTANT, value=(0, 0, 0))
+        result = padded_frame_gpu.download()
+        return result
+    else:
+        resized_frame = cv2.resize(frame, (target_width, target_height), interpolation=cv2.INTER_CUBIC)
+        padded_frame = cv2.copyMakeBorder(resized_frame, pad_top, pad_bottom, pad_left, pad_right,
+                                          borderType=cv2.BORDER_CONSTANT, value=(0, 0, 0))
+        return padded_frame
+
+def resize_with_padding(frame, target_dim):
+    target_height, target_width, _ = target_dim
+    frame_height, frame_width, _ = frame.shape
+
+    width_ratio = target_width / frame_width
+    height_ratio = target_height / frame_height
 
     # Choose the minimum scaling factor to maintain aspect ratio
     scale_factor = min(width_ratio, height_ratio)
 
     # Calculate new dimensions after resizing
-    new_width = int(frame.shape[1] * scale_factor)
-    new_height = int(frame.shape[0] * scale_factor)
+    new_width = int(frame_width * scale_factor)
+    new_height = int(frame_height * scale_factor)
 
     # Calculate padding dimensions
     pad_width = (target_width - new_width) // 2
     pad_height = (target_height - new_height) // 2
 
-    # Create a canvas with the desired dimensions and padding
-    canvas = np.zeros((target_height, target_width, channels), dtype=np.uint8)
-    # Resize the image and place it on the canvas
-    resized_image = cv2.resize(frame, (new_width, new_height))
-    canvas[pad_height:pad_height+new_height, pad_width:pad_width+new_width] = resized_image
-    return canvas
-def hook(frame_data, context):
+    padded_image = resize_and_pad(frame, (new_height, new_width), pad_height, pad_height, pad_width, pad_width)
+    return padded_image
+
+def hook(frame_data, _):
     frame = frame_data["original"].view()
     yolo_input_shape = (640, 640, 3) # h,w,c
-    frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
-    frame = resize_rgb_frame(frame, yolo_input_shape)
-    frame = cv2.normalize(frame, None, 0.0, 1.0, cv2.NORM_MINMAX)
+    frame = resize_with_padding(frame, yolo_input_shape)
+    frame = np.array(frame) / 255.0 # Normalize pixel values
     frame = np.transpose(frame, axes=(2,0,1)) # Convert to c,h,w
     inference_inputs = frame.astype("float32")
     frame_data['inference_input'] = inference_inputs
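The pre-processing above letterboxes the frame: it scales by the smaller of the two ratios and pads the short side symmetrically, which `postprocess_yolo` then reverses using the same `scale_factor`. A quick standalone check of that geometry (the 480x640 test frame is an arbitrary choice for illustration):

```python
import numpy as np
import cv2

frame = np.zeros((480, 640, 3), dtype=np.uint8)   # arbitrary landscape frame (h, w, c)
target_h, target_w = 640, 640                     # YOLO input size

scale = min(target_w / frame.shape[1], target_h / frame.shape[0])  # 1.0 for this frame
new_w, new_h = int(frame.shape[1] * scale), int(frame.shape[0] * scale)
pad_w = (target_w - new_w) // 2                   # 0: the width already fits
pad_h = (target_h - new_h) // 2                   # 80: black bars on top and bottom

resized = cv2.resize(frame, (new_w, new_h), interpolation=cv2.INTER_CUBIC)
padded = cv2.copyMakeBorder(resized, pad_h, pad_h, pad_w, pad_w,
                            borderType=cv2.BORDER_CONSTANT, value=(0, 0, 0))
assert padded.shape == (640, 640, 3)
```

Note that `//` floors the padding, so an odd leftover pixel is dropped; `postprocess_yolo` computes its `pad` with the same floor division, which keeps the two stages consistent.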
diff --git a/examples/yolo/process.py b/examples/yolo/process.py
index 75d3317..51ef42d 100644
--- a/examples/yolo/process.py
+++ b/examples/yolo/process.py
@@ -1,14 +1,8 @@
 import numpy as np
-import time
 
 def hook(frame, context):
     rgb_frame = frame['original']
     model = context['model']
-    input_fps = frame['fps']
-    delay = time.time() - frame['input_ts']
-    if input_fps > 0 and delay > 1 / input_fps:
-        print('Skipping frame to maintain real-time')
-    else:
-        prediction = next(model(rgb_frame, stream=True))
-        bboxes = prediction.boxes.data.tolist() if prediction.boxes else []
-        frame['inference_output'] = np.array(bboxes, dtype="float32")
+    prediction = next(model(rgb_frame, stream=True))
+    bboxes = prediction.boxes.data.tolist() if prediction.boxes else []
+    frame['inference_output'] = np.array(bboxes, dtype="float32")
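With the frame-skipping logic removed, this hook now runs inference on every frame and simply publishes the raw box tensor. In the usual Ultralytics layout, each row of `prediction.boxes.data` is `(x1, y1, x2, y2, confidence, class_id)`, so a downstream consumer of `frame['inference_output']` might look like this (a hypothetical sketch, not part of the examples):

```python
import numpy as np

def downstream_hook(frame, _context):
    detections = frame.get('inference_output')
    if detections is None or len(detections) == 0:
        return
    for x1, y1, x2, y2, conf, cls_id in detections:   # one row per detected object
        if conf >= 0.5:                               # arbitrary threshold for the sketch
            print(f"class {int(cls_id)}: ({x1:.0f}, {y1:.0f})-({x2:.0f}, {y2:.0f}) conf={conf:.2f}")
```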