
Commit

[add] add examples
yzqin committed Aug 26, 2023
1 parent 7a7631f commit d799bf6
Showing 4 changed files with 285 additions and 0 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -14,3 +14,4 @@ imgui.ini
.DS_Store
/.idea
/log
/example/data/output_*.mp4
Binary file added example/data/human_hand_video.mp4
Binary file not shown.
146 changes: 146 additions & 0 deletions example/detect_from_video.py
@@ -0,0 +1,146 @@
from pathlib import Path

import cv2
import numpy as np
import sapien.core as sapien
from sapien.asset import create_dome_envmap

from dex_retargeting.retargeting_config import get_retargeting_config, RetargetingConfig
from dex_retargeting.seq_retarget import SeqRetargeting
from single_hand_detector import SingleHandDetector

RECORD_VIDEO = False


def setup_sapien_viz_scene(urdf_path):
    from sapien.utils.viewer import Viewer
    import sapien.core as sapien

    engine = sapien.Engine()
    engine.set_log_level("warning")

    if not RECORD_VIDEO:
        sapien.render_config.camera_shader_dir = "rt"
        sapien.render_config.viewer_shader_dir = "rt"
        sapien.render_config.rt_samples_per_pixel = 32
        sapien.render_config.rt_use_denoiser = True

    renderer = sapien.SapienRenderer()

    engine.set_renderer(renderer)

    scene_config = sapien.SceneConfig()
    scene = engine.create_scene(scene_config)
    scene.set_timestep(1 / 240)

    scene.set_environment_map(create_dome_envmap(sky_color=[0.2, 0.2, 0.2], ground_color=[0.2, 0.2, 0.2]))
    scene.add_directional_light([-1, 0.5, -1], color=[2.0, 2.0, 2.0], shadow=True, scale=2.0, shadow_map_size=4096)

    loader = scene.create_urdf_loader()
    robot = loader.load(urdf_path)
    # robot_name is defined at module level in the __main__ block below
    if robot_name == "shadow":
        robot.set_pose(sapien.Pose([0, 0, -0.3]))
    elif robot_name == "schunk":
        robot.set_pose(sapien.Pose([0, 0, -0.05]))
    elif robot_name == "dlr":
        robot.set_pose(sapien.Pose([0, 0, -0.08]))

    camera = scene.add_camera(name="photo", width=1280, height=720, fovy=1, near=0.1, far=10)
    camera.set_local_pose(
        sapien.Pose([0.313487, 0.0653831, -0.0111697], [0.088142, -0.0298786, -0.00264502, -0.995656])
    )

    if RECORD_VIDEO:
        viewer = Viewer(renderer)
        viewer.set_scene(scene)
        # viewer.set_camera_pose(camera.pose)
    else:
        viewer = None

    for actor in robot.get_links():
        for visual in actor.get_visual_bodies():
            for mesh in visual.get_render_shapes():
                mat = mesh.material
                mat.set_base_color(np.array([0.3, 0.3, 0.3, 1]))
                mat.set_specular(0.7)
                mat.set_metallic(0.1)

    return scene, viewer


def build_retargeting(robot_name):
    if robot_name == "allegro":
        config_path = "teleop/allegro_hand_right.yml"
    elif robot_name == "shadow":
        config_path = "teleop/shadow_hand_right.yml"
    elif robot_name == "schunk":
        config_path = "teleop/schunk_svh_hand_right.yml"
    else:
        raise ValueError(f"Unrecognized robot_name: {robot_name}")

    test_config = get_retargeting_config(str(Path(__file__).parent.parent / f"dex_retargeting/configs/{config_path}"))
    seq_retargeting = test_config.build()
    return seq_retargeting, test_config


def retarget_video(seq_retargeting: SeqRetargeting, scene: sapien.Scene, viewer):
    video_path = Path(__file__).parent / "data/human_hand_video.mp4"
    cap = cv2.VideoCapture(str(video_path))
    robot = scene.get_all_articulations()[0]

    if not RECORD_VIDEO:
        writer = cv2.VideoWriter(f"data/output_{robot_name}.mp4", cv2.VideoWriter_fourcc(*"mp4v"), 30.0, (1280, 720))

    if not cap.isOpened():
        print("Error: Could not open video file.")
    else:
        detector = SingleHandDetector(hand_type="Right", selfie=False)
        while cap.isOpened():
            ret, frame = cap.read()

            if not ret:
                break

            rgb = frame[..., ::-1]
            num_box, joint_pos, keypoint_2d, mediapipe_wrist_rot = detector.detect(rgb)

            # VECTOR retargeting consumes keypoint-to-keypoint vectors; POSITION consumes keypoint positions directly
            retargeting_type = seq_retargeting.optimizer.retargeting_type
            indices = seq_retargeting.optimizer.target_link_human_indices
            if retargeting_type == "VECTOR":
                origin_indices = indices[0, :]
                task_indices = indices[1, :]
                ref_value = joint_pos[task_indices, :] - joint_pos[origin_indices, :]
            elif retargeting_type == "POSITION":
                ref_value = joint_pos[indices, :]
            else:
                raise ValueError(f"Unknown retargeting type: {retargeting_type}")
            qpos = seq_retargeting.retarget(ref_value)
            robot.set_qpos(qpos)
            if RECORD_VIDEO:
                for _ in range(3):
                    viewer.render()
            else:
                cam = scene.get_cameras()[0]
                scene.update_render()
                cam.take_picture()
                rgb = cam.get_texture("Color")[..., :3]
                rgb = (np.clip(rgb, 0, 1) * 255).astype(np.uint8)
                seg = cam.get_visual_actor_segmentation()[..., 0] < 1
                rgb[seg, :] = [255, 255, 255]
                writer.write(rgb[..., ::-1])
        print(seq_retargeting.num_retargeting)

    cap.release()
    cv2.destroyAllWindows()
    if not RECORD_VIDEO:
        writer.release()


if __name__ == "__main__":
    robot_name = ["allegro", "shadow", "schunk"][1]
    robot_dir = Path(__file__).parent.parent / "assets" / "robots"
    RetargetingConfig.set_default_urdf_dir(str(robot_dir))
    retargeting, cfg = build_retargeting(robot_name)
    scene, viewer = setup_sapien_viz_scene(cfg.urdf_path)
    retarget_video(retargeting, scene, viewer)
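Usage note (not part of the diff): assuming SAPIEN, MediaPipe, and OpenCV are installed, the example above is presumably run as a script from the example/ directory, so the relative data/output_*.mp4 path matches the new .gitignore entry; the target hand is picked by the hard-coded index in __main__. An illustrative tweak:

# hypothetical edit: select the Allegro hand instead of the Shadow hand (index 1 is the default above)
robot_name = ["allegro", "shadow", "schunk"][0]
# then, from the example/ directory:  python detect_from_video.py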
138 changes: 138 additions & 0 deletions example/single_hand_detector.py
@@ -0,0 +1,138 @@
import mediapipe as mp
import mediapipe.framework as framework
import numpy as np
from mediapipe.framework.formats import landmark_pb2
from mediapipe.python.solutions import hands_connections
from mediapipe.python.solutions.drawing_utils import DrawingSpec
from mediapipe.python.solutions.hands import HandLandmark

# Rotation from the estimated operator wrist frame to the MANO hand convention
OPERATOR2MANO_RIGHT = np.array(
    [
        [0, 0, -1],
        [-1, 0, 0],
        [0, 1, 0],
    ]
)

OPERATOR2MANO_LEFT = np.array(
    [
        [0, 0, -1],
        [1, 0, 0],
        [0, -1, 0],
    ]
)


class SingleHandDetector:
    def __init__(self, hand_type="Right", min_detection_confidence=0.8, min_tracking_confidence=0.8, selfie=False):
        self.hand_detector = mp.solutions.hands.Hands(
            static_image_mode=False,
            max_num_hands=1,
            min_detection_confidence=min_detection_confidence,
            min_tracking_confidence=min_tracking_confidence,
        )
        self.selfie = selfie
        self.operator2mano = OPERATOR2MANO_RIGHT if hand_type == "Right" else OPERATOR2MANO_LEFT
        inverse_hand_dict = {"Right": "Left", "Left": "Right"}
        # MediaPipe reports handedness as if the image were mirrored (selfie view),
        # so the label is flipped for non-selfie input
        self.detected_hand_type = hand_type if selfie else inverse_hand_dict[hand_type]

    @staticmethod
    def draw_skeleton_on_image(image, keypoint_2d: landmark_pb2.NormalizedLandmarkList, style="white"):
        if style == "default":
            mp.solutions.drawing_utils.draw_landmarks(
                image,
                keypoint_2d,
                mp.solutions.hands.HAND_CONNECTIONS,
                mp.solutions.drawing_styles.get_default_hand_landmarks_style(),
                mp.solutions.drawing_styles.get_default_hand_connections_style(),
            )
        elif style == "white":
            landmark_style = {}
            for landmark in HandLandmark:
                landmark_style[landmark] = DrawingSpec(color=(255, 48, 48), circle_radius=4, thickness=-1)

            connections = hands_connections.HAND_CONNECTIONS
            connection_style = {}
            for pair in connections:
                connection_style[pair] = DrawingSpec(thickness=2)

            mp.solutions.drawing_utils.draw_landmarks(
                image, keypoint_2d, mp.solutions.hands.HAND_CONNECTIONS, landmark_style, connection_style
            )

        return image

    def detect(self, rgb):
        results = self.hand_detector.process(rgb)
        if not results.multi_hand_landmarks:
            return 0, None, None, None

        desired_hand_num = -1
        for i in range(len(results.multi_hand_landmarks)):
            # handedness label reported by MediaPipe, either "Left" or "Right"
            label = results.multi_handedness[i].ListFields()[0][1][0].label
            if label == self.detected_hand_type:
                desired_hand_num = i
                break
        if desired_hand_num < 0:
            return 0, None, None, None

        keypoint_3d = results.multi_hand_world_landmarks[desired_hand_num]
        keypoint_2d = results.multi_hand_landmarks[desired_hand_num]
        num_box = len(results.multi_hand_landmarks)

        # Parse 3d keypoints from MediaPipe hand detector
        keypoint_3d_array = self.parse_keypoint_3d(keypoint_3d)
        keypoint_3d_array = keypoint_3d_array - keypoint_3d_array[0:1, :]
        mediapipe_wrist_rot = self.estimate_frame_from_hand_points(keypoint_3d_array)
        joint_pos = keypoint_3d_array @ mediapipe_wrist_rot @ self.operator2mano

        return num_box, joint_pos, keypoint_2d, mediapipe_wrist_rot

    @staticmethod
    def parse_keypoint_3d(keypoint_3d: framework.formats.landmark_pb2.LandmarkList) -> np.ndarray:
        keypoint = np.empty([21, 3])
        for i in range(21):
            keypoint[i][0] = keypoint_3d.landmark[i].x
            keypoint[i][1] = keypoint_3d.landmark[i].y
            keypoint[i][2] = keypoint_3d.landmark[i].z
        return keypoint

    @staticmethod
    def parse_keypoint_2d(keypoint_2d: landmark_pb2.NormalizedLandmarkList, img_size) -> np.ndarray:
        keypoint = np.empty([21, 2])
        for i in range(21):
            keypoint[i][0] = keypoint_2d.landmark[i].x
            keypoint[i][1] = keypoint_2d.landmark[i].y
        keypoint = keypoint * np.array([img_size[1], img_size[0]])[None, :]
        return keypoint

    @staticmethod
    def estimate_frame_from_hand_points(keypoint_3d_array: np.ndarray) -> np.ndarray:
        """
        Compute the 3D coordinate frame (orientation only) from detected 3d key points
        :param keypoint_3d_array: (21, 3) keypoints detected by the MediaPipe hand detector
        :return: the coordinate frame of the wrist in MANO convention
        """
        assert keypoint_3d_array.shape == (21, 3)
        # Use the wrist, index-finger MCP, and middle-finger MCP keypoints
        points = keypoint_3d_array[[0, 5, 9], :]

        # Compute the vector from the middle-finger MCP joint to the wrist
        x_vector = points[0] - points[2]

        # Normal fitting with SVD
        points = points - np.mean(points, axis=0, keepdims=True)
        u, s, v = np.linalg.svd(points)

        normal = v[2, :]

        # Gram–Schmidt orthonormalization
        x = x_vector - np.sum(x_vector * normal) * normal
        x = x / np.linalg.norm(x)
        z = np.cross(x, normal)

        # We assume that the vector from the pinky to the index finger is similar to the z axis in MANO convention
        if np.sum(z * (points[1] - points[2])) < 0:
            normal *= -1
            z *= -1
        frame = np.stack([x, normal, z], axis=1)
        return frame

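For reference, a minimal sketch (not part of this commit) of using SingleHandDetector on its own; the webcam index and frame handling below are illustrative, while the detector API follows the file above:

import cv2
from single_hand_detector import SingleHandDetector

detector = SingleHandDetector(hand_type="Right", selfie=False)
cap = cv2.VideoCapture(0)  # assumed webcam index
ret, frame = cap.read()
if ret:
    rgb = frame[..., ::-1]  # OpenCV frames are BGR; the detector expects RGB
    num_box, joint_pos, keypoint_2d, wrist_rot = detector.detect(rgb)
    if num_box > 0:
        print(joint_pos.shape)  # (21, 3) keypoints expressed in the MANO wrist frame
cap.release()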