
Merge pull request #129 from pipeless-ai/improve_examples
fix(examples): Improve yolo onnx example
miguelaeh authored Jan 31, 2024
2 parents: 537e8e9 + 583cb26 · commit: 5f8254f
Showing 3 changed files with 125 additions and 70 deletions.
examples/onnx-yolo/post-process.py: 129 changes (87 additions, 42 deletions)
@@ -4,17 +4,21 @@
 def hook(frame_data, _):
     frame = frame_data['original']
     model_output = frame_data['inference_output']
-    yolo_input_shape = (640, 640, 3) # h,w,c
-    boxes, scores, class_ids = parse_yolo_output(model_output, frame.shape, yolo_input_shape)
-    class_labels = [yolo_classes[id] for id in class_ids]
-    for i in range(len(boxes)):
-        draw_bbox(frame, boxes[i], class_labels[i], scores[i])
+    if len(model_output) > 0:
+        yolo_input_shape = (640, 640, 3) # h,w,c
+        boxes, scores, class_ids = postprocess_yolo(frame.shape, yolo_input_shape, model_output)
+        class_labels = [yolo_classes[id] for id in class_ids]
+        for i in range(len(boxes)):
+            draw_bbox(frame, boxes[i], class_labels[i], scores[i], color_palette[class_ids[i]])

-    frame_data['modified'] = frame
+        frame_data['modified'] = frame

 #################################################
 # Util functions to make the hook more readable #
 #################################################
+confidence_thres = 0.45
+iou_thres = 0.5

 yolo_classes = ['person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train', 'truck', 'boat', 'traffic light',
                 'fire hydrant', 'stop sign', 'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', 'cow',
                 'elephant', 'bear', 'zebra', 'giraffe', 'backpack', 'umbrella', 'handbag', 'tie', 'suitcase', 'frisbee',
@@ -24,47 +24,19 @@ def hook(frame_data, _):
                 'potted plant', 'bed', 'dining table', 'toilet', 'tv', 'laptop', 'mouse', 'remote', 'keyboard',
                 'cell phone', 'microwave', 'oven', 'toaster', 'sink', 'refrigerator', 'book', 'clock', 'vase',
                 'scissors', 'teddy bear', 'hair drier', 'toothbrush']
+color_palette = np.random.uniform(0, 255, size=(len(yolo_classes), 3))

 def xywh2xyxy(i):
     """
     Converts from (center-x, center-y,w,h) to (x1,y1,x2,y2)
     """
     o = i.view() # Create numpy view
-    dw = i[..., 2] / 2
-    dh = i[..., 3] / 2
-    o[..., 0] = i[..., 0] - dw
-    o[..., 1] = i[..., 1] - dh
-    o[..., 2] = i[..., 0] + dw
-    o[..., 3] = i[..., 1] + dh
+    o[..., 0] = i[..., 0] - i[..., 2] / 2
+    o[..., 1] = i[..., 1] - i[..., 3] / 2
+    o[..., 2] = i[..., 0] + i[..., 2]
+    o[..., 3] = i[..., 1] + i[..., 3]
     return o

-def rescale_boxes(original_image_shape, model_input_shape, boxes):
-    img_height, img_width, _ = original_image_shape
-    input_height, input_width, _ = model_input_shape
-    input_shape = np.array([input_width, input_height, input_width, input_height])
-    boxes = np.divide(boxes, input_shape, dtype=np.float32)
-    boxes *= np.array([img_width, img_height, img_width, img_height])
-    return boxes
-
-def parse_yolo_output(model_output, orginal_image_shape, model_input_shape):
-    confidence_threshold = 0.3
-    iou_threshold = 0.7
-
-    predictions = np.squeeze(model_output[0]).T
-
-    scores = np.max(predictions[:, 4:], axis=1)
-    predictions = predictions[scores > confidence_threshold, :]
-    scores = scores[scores > confidence_threshold]
-    if len(scores) == 0:
-        return [], [], []
-
-    class_ids = np.argmax(predictions[:, 4:], axis=1)
-
-    # Extract boxes
-    boxes = predictions[:, :4]
-    boxes = rescale_boxes(orginal_image_shape, model_input_shape, boxes)
-    boxes = xywh2xyxy(boxes)
-
-    indices = cv2.dnn.NMSBoxes(boxes, scores, confidence_threshold, iou_threshold)
-    return boxes[indices], scores[indices], class_ids[indices]

 def clip_boxes(boxes, shape):
     boxes[..., [0, 2]] = boxes[..., [0, 2]].clip(0, shape[1]) # x1, x2
     boxes[..., [1, 3]] = boxes[..., [1, 3]].clip(0, shape[0]) # y1, y2
@@ -85,3 +61,72 @@ def draw_bbox(image, box, label='', score=None, color=(255, 0, 255), txt_color=(
         else:
             cv2.putText(image, label, (p1[0], p1[1] - 2 if outside else p1[1] + h + 2),
                         0, lw / 3, txt_color, thickness=tf, lineType=cv2.LINE_AA)
+
+def postprocess_yolo(original_frame_shape, resized_img_shape, output):
+    original_height, original_width, _ = original_frame_shape
+    resized_height, resized_width, _ = resized_img_shape
+
+    outputs = np.transpose(np.squeeze(output[0]))
+
+    # Get the number of rows in the outputs array
+    rows = outputs.shape[0]
+
+    boxes = []
+    scores = []
+    class_ids = []
+
+    # Calculate the scaling factors for the bounding box coordinates
+    if original_height > original_width:
+        scale_factor = original_height / resized_height
+    else:
+        scale_factor = original_width / resized_width
+
+    # Iterate over each row in the outputs array
+    for i in range(rows):
+        classes_scores = outputs[i][4:]
+
+        # FIXME: For some reason when using YOLO in ONNX sometimes it returns NaN values in the classes scores
+        # and other times it returns 1 for some classes and 0 for the rest which is almost certainly a bad prediction.
+        # This hack skips those entries
+        nan_mask = np.isnan(classes_scores)
+        if np.any(nan_mask):
+            continue
+        if np.any(classes_scores == 1):
+            continue
+
+        max_score = np.amax(classes_scores)
+        if max_score >= confidence_thres:
+            class_id = np.argmax(classes_scores) # Get the class ID with the highest score
+            x, y, w, h = outputs[i][0], outputs[i][1], outputs[i][2], outputs[i][3]
+
+            ## Calculate the scaled coordinates of the bounding box
+            ## the original image was padded to be square
+            if original_height > original_width:
+                # we added pad on the width
+                pad = (resized_width - original_width / scale_factor) // 2
+                left = int((x - pad) * scale_factor)
+                top = int(y * scale_factor)
+            else:
+                # we added pad on the height
+                pad = (resized_height - original_height / scale_factor) // 2
+                left = int(x * scale_factor)
+                top = int((y - pad) * scale_factor)
+            width = int(w * scale_factor)
+            height = int(h * scale_factor)
+
+            class_ids.append(class_id)
+            scores.append(max_score)
+            boxes.append([left, top, width, height])
+
+    if len(boxes) > 0:
+        boxes = np.array(boxes)
+        scores = np.array(scores)
+        class_ids = np.array(class_ids)
+
+        clip_boxes(boxes, original_frame_shape)
+        boxes = xywh2xyxy(boxes)
+        indices = cv2.dnn.NMSBoxes(boxes, scores, confidence_thres, iou_thres)
+
+        return boxes[indices], scores[indices], class_ids[indices]
+    else:
+        return [], [], []
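
Note: a quick way to smoke-test the new postprocess_yolo is to feed it a synthetic tensor. The sketch below is not part of this commit; it assumes the functions above are already defined in the session, and that the model emits the usual YOLOv8-style ONNX layout of (1, 84, 8400), i.e. 4 box coordinates plus 80 class scores per candidate.

import numpy as np

# Synthetic model output: one confident detection, everything else zero.
dummy = np.zeros((1, 84, 8400), dtype=np.float32)
dummy[0, 0:4, 0] = [320, 320, 100, 100]  # cx, cy, w, h in the 640x640 model space
dummy[0, 4, 0] = 0.9                     # class 0 ('person') score above confidence_thres

frame_shape = (480, 640, 3)       # original frame (h, w, c)
yolo_input_shape = (640, 640, 3)  # model input (h, w, c)

boxes, scores, class_ids = postprocess_yolo(frame_shape, yolo_input_shape, [dummy])
print(boxes, scores, class_ids)   # expect a single 'person' box, roughly centered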
examples/onnx-yolo/pre-process.py: 54 changes (35 additions, 19 deletions)
@@ -1,34 +1,50 @@
 import cv2
 import numpy as np

-def resize_rgb_frame(frame, target_dim):
-    target_height = target_dim[0]
-    target_width = target_dim[1]
-    channels = target_dim[2]
-    # Scale the image maintaining aspect ratio
-    width_ratio = target_width / frame.shape[1]
-    height_ratio = target_height / frame.shape[0]
+def is_cuda_available():
+    return cv2.cuda.getCudaEnabledDeviceCount() > 0
+
+"""
+Resize and pad image. Uses CUDA when available
+"""
+def resize_and_pad(frame, target_dim, pad_top, pad_bottom, pad_left, pad_right):
+    target_height, target_width = target_dim
+    if is_cuda_available():
+        # FIXME: due to the memory allocation here could be even slower than running on CPU. We must provide the frame from GPU memory to the hook
+        frame_gpu = cv2.cuda_GpuMat(frame)
+        resized_frame_gpu = cv2.cuda.resize(frame_gpu, (target_width, target_height), interpolation=cv2.INTER_CUBIC)
+        padded_frame_gpu = cv2.cuda.copyMakeBorder(resized_frame_gpu, pad_top, pad_bottom, pad_left, pad_right, cv2.BORDER_CONSTANT, value=(0, 0, 0))
+        result = padded_frame_gpu.download()
+        return result
+    else:
+        resized_frame = cv2.resize(frame, (target_width, target_height), interpolation=cv2.INTER_CUBIC)
+        padded_frame = cv2.copyMakeBorder(resized_frame, pad_top, pad_bottom, pad_left, pad_right,
+                                          borderType=cv2.BORDER_CONSTANT, value=(0, 0, 0))
+        return padded_frame
+
+def resize_with_padding(frame, target_dim):
+    target_height, target_width, _ = target_dim
+    frame_height, frame_width, _ = frame.shape
+
+    width_ratio = target_width / frame_width
+    height_ratio = target_height / frame_height
     # Choose the minimum scaling factor to maintain aspect ratio
     scale_factor = min(width_ratio, height_ratio)
     # Calculate new dimensions after resizing
-    new_width = int(frame.shape[1] * scale_factor)
-    new_height = int(frame.shape[0] * scale_factor)
+    new_width = int(frame_width * scale_factor)
+    new_height = int(frame_height * scale_factor)
     # Calculate padding dimensions
     pad_width = (target_width - new_width) // 2
     pad_height = (target_height - new_height) // 2
-    # Create a canvas with the desired dimensions and padding
-    canvas = np.zeros((target_height, target_width, channels), dtype=np.uint8)
-    # Resize the image and place it on the canvas
-    resized_image = cv2.resize(frame, (new_width, new_height))
-    canvas[pad_height:pad_height+new_height, pad_width:pad_width+new_width] = resized_image
-    return canvas
-
-def hook(frame_data, context):
+    padded_image = resize_and_pad(frame, (new_height, new_width), pad_height, pad_height, pad_width, pad_width)
+    return padded_image
+
+def hook(frame_data, _):
     frame = frame_data["original"].view()
     yolo_input_shape = (640, 640, 3) # h,w,c
     frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
-    frame = resize_rgb_frame(frame, yolo_input_shape)
-    frame = cv2.normalize(frame, None, 0.0, 1.0, cv2.NORM_MINMAX)
+    frame = resize_with_padding(frame, yolo_input_shape)
+    frame = np.array(frame) / 255.0 # Normalize pixel values
     frame = np.transpose(frame, axes=(2,0,1)) # Convert to c,h,w
     inference_inputs = frame.astype("float32")
     frame_data['inference_input'] = inference_inputs
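
Note: the preprocessing contract can be sanity-checked in isolation. This sketch is not part of the commit; it assumes the hook above is in scope and uses an arbitrary 1080p test frame. For that input the result should be a 640x640 CHW float32 tensor with values in [0, 1].

import numpy as np

frame_data = {"original": np.random.randint(0, 255, (1080, 1920, 3), dtype=np.uint8)}
hook(frame_data, None)

tensor = frame_data["inference_input"]
print(tensor.shape, tensor.dtype)                   # expect (3, 640, 640) float32
print(0.0 <= tensor.min() and tensor.max() <= 1.0)  # normalized pixel values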
examples/yolo/process.py: 12 changes (3 additions, 9 deletions)
@@ -1,14 +1,8 @@
 import numpy as np
-import time

 def hook(frame, context):
     rgb_frame = frame['original']
     model = context['model']
-    input_fps = frame['fps']
-    delay = time.time() - frame['input_ts']
-    if input_fps > 0 and delay > 1 / input_fps:
-        print('Skipping frame to maintain real-time')
-    else:
-        prediction = next(model(rgb_frame, stream=True))
-        bboxes = prediction.boxes.data.tolist() if prediction.boxes else []
-        frame['inference_output'] = np.array(bboxes, dtype="float32")
+    prediction = next(model(rgb_frame, stream=True))
+    bboxes = prediction.boxes.data.tolist() if prediction.boxes else []
+    frame['inference_output'] = np.array(bboxes, dtype="float32")
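
Note: with the frame-skipping logic removed, the hook is simply "model in context, frame in, boxes out". A hedged sketch of driving it directly, not part of this commit: it assumes an Ultralytics YOLO model (the weights name is arbitrary) standing in for however the stage populates context['model'].

import numpy as np
from ultralytics import YOLO  # assumption: any model with this Results API works here

context = {"model": YOLO("yolov8n.pt")}  # arbitrary weights, for illustration only
frame = {"original": np.random.randint(0, 255, (480, 640, 3), dtype=np.uint8)}

hook(frame, context)
# Rows of [x1, y1, x2, y2, conf, class]; likely empty on a noise frame.
print(frame["inference_output"])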
