
Commit

[add] add examples
yzqin committed Aug 26, 2023
1 parent 7a7631f commit d799bf6
Showing 4 changed files with 285 additions and 0 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -14,3 +14,4 @@ imgui.ini
.DS_Store
/.idea
/log
/example/data/output_*.mp4
Binary file added example/data/human_hand_video.mp4
Binary file not shown.
146 changes: 146 additions & 0 deletions example/detect_from_video.py
@@ -0,0 +1,146 @@
from pathlib import Path

import cv2
import numpy as np
import sapien.core as sapien
from sapien.asset import create_dome_envmap

from dex_retargeting.retargeting_config import get_retargeting_config, RetargetingConfig
from dex_retargeting.seq_retarget import SeqRetargeting
from single_hand_detector import SingleHandDetector

RECORD_VIDEO = False


def setup_sapien_viz_scene(urdf_path):
    from sapien.utils.viewer import Viewer
    import sapien.core as sapien

    engine = sapien.Engine()
    engine.set_log_level("warning")

    if not RECORD_VIDEO:
        sapien.render_config.camera_shader_dir = "rt"
        sapien.render_config.viewer_shader_dir = "rt"
        sapien.render_config.rt_samples_per_pixel = 32
        sapien.render_config.rt_use_denoiser = True

    renderer = sapien.SapienRenderer()

    engine.set_renderer(renderer)

    scene_config = sapien.SceneConfig()
    scene = engine.create_scene(scene_config)
    scene.set_timestep(1 / 240)

    scene.set_environment_map(create_dome_envmap(sky_color=[0.2, 0.2, 0.2], ground_color=[0.2, 0.2, 0.2]))
    scene.add_directional_light([-1, 0.5, -1], color=[2.0, 2.0, 2.0], shadow=True, scale=2.0, shadow_map_size=4096)

    loader = scene.create_urdf_loader()
    robot = loader.load(urdf_path)
    # robot_name is defined at module level in the __main__ block below
    if robot_name == "shadow":
        robot.set_pose(sapien.Pose([0, 0, -0.3]))
    elif robot_name == "schunk":
        robot.set_pose(sapien.Pose([0, 0, -0.05]))
    elif robot_name == "dlr":
        robot.set_pose(sapien.Pose([0, 0, -0.08]))

    camera = scene.add_camera(name="photo", width=1280, height=720, fovy=1, near=0.1, far=10)
    camera.set_local_pose(
        sapien.Pose([0.313487, 0.0653831, -0.0111697], [0.088142, -0.0298786, -0.00264502, -0.995656])
    )

    if RECORD_VIDEO:
        viewer = Viewer(renderer)
        viewer.set_scene(scene)
        # viewer.set_camera_pose(camera.pose)
    else:
        viewer = None

    for actor in robot.get_links():
        for visual in actor.get_visual_bodies():
            for mesh in visual.get_render_shapes():
                mat = mesh.material
                mat.set_base_color(np.array([0.3, 0.3, 0.3, 1]))
                mat.set_specular(0.7)
                mat.set_metallic(0.1)

    return scene, viewer


def build_retargeting(robot_name):
    if robot_name == "allegro":
        config_path = "teleop/allegro_hand_right.yml"
    elif robot_name == "shadow":
        config_path = "teleop/shadow_hand_right.yml"
    elif robot_name == "schunk":
        config_path = "teleop/schunk_svh_hand_right.yml"
    else:
        raise ValueError(f"Unrecognized robot_name: {robot_name}")

    test_config = get_retargeting_config(str(Path(__file__).parent.parent / f"dex_retargeting/configs/{config_path}"))
    seq_retargeting = test_config.build()
    return seq_retargeting, test_config


def retarget_video(seq_retargeting: SeqRetargeting, scene: sapien.Scene, viewer):
    video_path = Path(__file__).parent / "data/human_hand_video.mp4"
    cap = cv2.VideoCapture(str(video_path))
    robot = scene.get_all_articulations()[0]

    if not RECORD_VIDEO:
        writer = cv2.VideoWriter(f"data/output_{robot_name}.mp4", cv2.VideoWriter_fourcc(*"mp4v"), 30.0, (1280, 720))

    if not cap.isOpened():
        print("Error: Could not open video file.")
    else:
        detector = SingleHandDetector(hand_type="Right", selfie=False)
        while cap.isOpened():
            ret, frame = cap.read()

            if not ret:
                break

            rgb = frame[..., ::-1]
            num_box, joint_pos, keypoint_2d, mediapipe_wrist_rot = detector.detect(rgb)

            # VECTOR retargeting consumes keypoint-to-keypoint vectors; POSITION consumes keypoint positions directly
            retargeting_type = seq_retargeting.optimizer.retargeting_type
            indices = seq_retargeting.optimizer.target_link_human_indices
            if retargeting_type == "VECTOR":
                origin_indices = indices[0, :]
                task_indices = indices[1, :]
                ref_value = joint_pos[task_indices, :] - joint_pos[origin_indices, :]
            elif retargeting_type == "POSITION":
                ref_value = joint_pos[indices, :]
            else:
                raise ValueError(f"Unknown retargeting type: {retargeting_type}")
            qpos = seq_retargeting.retarget(ref_value)
            robot.set_qpos(qpos)
            if RECORD_VIDEO:
                for _ in range(3):
                    viewer.render()
            else:
                cam = scene.get_cameras()[0]
                scene.update_render()
                cam.take_picture()
                rgb = cam.get_texture("Color")[..., :3]
                rgb = (np.clip(rgb, 0, 1) * 255).astype(np.uint8)
                seg = cam.get_visual_actor_segmentation()[..., 0] < 1
                rgb[seg, :] = [255, 255, 255]
                writer.write(rgb[..., ::-1])
        print(seq_retargeting.num_retargeting)

    cap.release()
    cv2.destroyAllWindows()
    if not RECORD_VIDEO:
        writer.release()


if __name__ == "__main__":
    robot_name = ["allegro", "shadow", "schunk"][1]
    robot_dir = Path(__file__).parent.parent / "assets" / "robots"
    RetargetingConfig.set_default_urdf_dir(str(robot_dir))
    retargeting, cfg = build_retargeting(robot_name)
    scene, viewer = setup_sapien_viz_scene(cfg.urdf_path)
    retarget_video(retargeting, scene, viewer)
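Usage note (not part of the diff): assuming SAPIEN, MediaPipe, and OpenCV are installed, the example above is presumably run as a script from the example/ directory, so the relative data/output_*.mp4 path matches the new .gitignore entry; the target hand is picked by the hard-coded index in __main__. An illustrative tweak:

# hypothetical edit: select the Allegro hand instead of the Shadow hand (index 1 is the default above)
robot_name = ["allegro", "shadow", "schunk"][0]
# then, from the example/ directory:  python detect_from_video.py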
138 changes: 138 additions & 0 deletions example/single_hand_detector.py
@@ -0,0 +1,138 @@
import mediapipe as mp
import mediapipe.framework as framework
import numpy as np
from mediapipe.framework.formats import landmark_pb2
from mediapipe.python.solutions import hands_connections
from mediapipe.python.solutions.drawing_utils import DrawingSpec
from mediapipe.python.solutions.hands import HandLandmark

# Rotation from the estimated operator wrist frame to the MANO hand convention
OPERATOR2MANO_RIGHT = np.array(
    [
        [0, 0, -1],
        [-1, 0, 0],
        [0, 1, 0],
    ]
)

OPERATOR2MANO_LEFT = np.array(
    [
        [0, 0, -1],
        [1, 0, 0],
        [0, -1, 0],
    ]
)


class SingleHandDetector:
    def __init__(self, hand_type="Right", min_detection_confidence=0.8, min_tracking_confidence=0.8, selfie=False):
        self.hand_detector = mp.solutions.hands.Hands(
            static_image_mode=False,
            max_num_hands=1,
            min_detection_confidence=min_detection_confidence,
            min_tracking_confidence=min_tracking_confidence,
        )
        self.selfie = selfie
        self.operator2mano = OPERATOR2MANO_RIGHT if hand_type == "Right" else OPERATOR2MANO_LEFT
        inverse_hand_dict = {"Right": "Left", "Left": "Right"}
        # MediaPipe reports handedness as if the image were mirrored (selfie view),
        # so the label is flipped for non-selfie input
        self.detected_hand_type = hand_type if selfie else inverse_hand_dict[hand_type]

    @staticmethod
    def draw_skeleton_on_image(image, keypoint_2d: landmark_pb2.NormalizedLandmarkList, style="white"):
        if style == "default":
            mp.solutions.drawing_utils.draw_landmarks(
                image,
                keypoint_2d,
                mp.solutions.hands.HAND_CONNECTIONS,
                mp.solutions.drawing_styles.get_default_hand_landmarks_style(),
                mp.solutions.drawing_styles.get_default_hand_connections_style(),
            )
        elif style == "white":
            landmark_style = {}
            for landmark in HandLandmark:
                landmark_style[landmark] = DrawingSpec(color=(255, 48, 48), circle_radius=4, thickness=-1)

            connections = hands_connections.HAND_CONNECTIONS
            connection_style = {}
            for pair in connections:
                connection_style[pair] = DrawingSpec(thickness=2)

            mp.solutions.drawing_utils.draw_landmarks(
                image, keypoint_2d, mp.solutions.hands.HAND_CONNECTIONS, landmark_style, connection_style
            )

        return image

    def detect(self, rgb):
        results = self.hand_detector.process(rgb)
        if not results.multi_hand_landmarks:
            return 0, None, None, None

        desired_hand_num = -1
        for i in range(len(results.multi_hand_landmarks)):
            # handedness label reported by MediaPipe, either "Left" or "Right"
            label = results.multi_handedness[i].ListFields()[0][1][0].label
            if label == self.detected_hand_type:
                desired_hand_num = i
                break
        if desired_hand_num < 0:
            return 0, None, None, None

        keypoint_3d = results.multi_hand_world_landmarks[desired_hand_num]
        keypoint_2d = results.multi_hand_landmarks[desired_hand_num]
        num_box = len(results.multi_hand_landmarks)

        # Parse 3d keypoints from MediaPipe hand detector
        keypoint_3d_array = self.parse_keypoint_3d(keypoint_3d)
        keypoint_3d_array = keypoint_3d_array - keypoint_3d_array[0:1, :]
        mediapipe_wrist_rot = self.estimate_frame_from_hand_points(keypoint_3d_array)
        joint_pos = keypoint_3d_array @ mediapipe_wrist_rot @ self.operator2mano

        return num_box, joint_pos, keypoint_2d, mediapipe_wrist_rot

    @staticmethod
    def parse_keypoint_3d(keypoint_3d: framework.formats.landmark_pb2.LandmarkList) -> np.ndarray:
        keypoint = np.empty([21, 3])
        for i in range(21):
            keypoint[i][0] = keypoint_3d.landmark[i].x
            keypoint[i][1] = keypoint_3d.landmark[i].y
            keypoint[i][2] = keypoint_3d.landmark[i].z
        return keypoint

    @staticmethod
    def parse_keypoint_2d(keypoint_2d: landmark_pb2.NormalizedLandmarkList, img_size) -> np.ndarray:
        keypoint = np.empty([21, 2])
        for i in range(21):
            keypoint[i][0] = keypoint_2d.landmark[i].x
            keypoint[i][1] = keypoint_2d.landmark[i].y
        keypoint = keypoint * np.array([img_size[1], img_size[0]])[None, :]
        return keypoint

    @staticmethod
    def estimate_frame_from_hand_points(keypoint_3d_array: np.ndarray) -> np.ndarray:
        """
        Compute the 3D coordinate frame (orientation only) from detected 3d key points
        :param keypoint_3d_array: (21, 3) keypoints detected by the MediaPipe hand detector
        :return: the coordinate frame of the wrist in MANO convention
        """
        assert keypoint_3d_array.shape == (21, 3)
        # Use the wrist, index-finger MCP, and middle-finger MCP keypoints
        points = keypoint_3d_array[[0, 5, 9], :]

        # Compute the vector from the middle-finger MCP joint to the wrist
        x_vector = points[0] - points[2]

        # Normal fitting with SVD
        points = points - np.mean(points, axis=0, keepdims=True)
        u, s, v = np.linalg.svd(points)

        normal = v[2, :]

        # Gram–Schmidt orthonormalization
        x = x_vector - np.sum(x_vector * normal) * normal
        x = x / np.linalg.norm(x)
        z = np.cross(x, normal)

        # We assume that the vector from the pinky to the index finger is similar to the z axis in MANO convention
        if np.sum(z * (points[1] - points[2])) < 0:
            normal *= -1
            z *= -1
        frame = np.stack([x, normal, z], axis=1)
        return frame

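For reference, a minimal sketch (not part of this commit) of using SingleHandDetector on its own; the webcam index and frame handling below are illustrative, while the detector API follows the file above:

import cv2
from single_hand_detector import SingleHandDetector

detector = SingleHandDetector(hand_type="Right", selfie=False)
cap = cv2.VideoCapture(0)  # assumed webcam index
ret, frame = cap.read()
if ret:
    rgb = frame[..., ::-1]  # OpenCV frames are BGR; the detector expects RGB
    num_box, joint_pos, keypoint_2d, wrist_rot = detector.detect(rgb)
    if num_box > 0:
        print(joint_pos.shape)  # (21, 3) keypoints expressed in the MANO wrist frame
cap.release()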