-
Notifications
You must be signed in to change notification settings - Fork 23
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
4 changed files
with
285 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -14,3 +14,4 @@ imgui.ini | |
.DS_Store | ||
/.idea | ||
/log | ||
/example/data/output_*.mp4 |
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,146 @@ | ||
from pathlib import Path | ||
|
||
import cv2 | ||
import numpy as np | ||
import sapien.core as sapien | ||
from sapien.asset import create_dome_envmap | ||
|
||
from dex_retargeting.retargeting_config import get_retargeting_config, RetargetingConfig | ||
from dex_retargeting.seq_retarget import SeqRetargeting | ||
from single_hand_detector import SingleHandDetector | ||
|
||
RECORD_VIDEO = False | ||
|
||
|
||
def setup_sapien_viz_scene(urdf_path): | ||
from sapien.utils.viewer import Viewer | ||
import sapien.core as sapien | ||
|
||
engine = sapien.Engine() | ||
engine.set_log_level("warning") | ||
|
||
if not RECORD_VIDEO: | ||
sapien.render_config.camera_shader_dir = "rt" | ||
sapien.render_config.viewer_shader_dir = "rt" | ||
sapien.render_config.rt_samples_per_pixel = 32 | ||
sapien.render_config.rt_use_denoiser = True | ||
|
||
renderer = sapien.SapienRenderer() | ||
|
||
engine.set_renderer(renderer) | ||
|
||
scene_config = sapien.SceneConfig() | ||
scene = engine.create_scene(scene_config) | ||
scene.set_timestep(1 / 240) | ||
|
||
scene.set_environment_map(create_dome_envmap(sky_color=[0.2, 0.2, 0.2], ground_color=[0.2, 0.2, 0.2])) | ||
scene.add_directional_light([-1, 0.5, -1], color=[2.0, 2.0, 2.0], shadow=True, scale=2.0, shadow_map_size=4096) | ||
|
||
loader = scene.create_urdf_loader() | ||
robot = loader.load(urdf_path) | ||
if robot_name == "shadow": | ||
robot.set_pose(sapien.Pose([0, 0, -0.3])) | ||
elif robot_name == "schunk": | ||
robot.set_pose(sapien.Pose([0, 0, -0.05])) | ||
elif robot_name == "dlr": | ||
robot.set_pose(sapien.Pose([0, 0, -0.08])) | ||
|
||
camera = scene.add_camera(name="photo", width=1280, height=720, fovy=1, near=0.1, far=10) | ||
camera.set_local_pose( | ||
sapien.Pose([0.313487, 0.0653831, -0.0111697], [0.088142, -0.0298786, -0.00264502, -0.995656]) | ||
) | ||
|
||
if RECORD_VIDEO: | ||
viewer = Viewer(renderer) | ||
viewer.set_scene(scene) | ||
# viewer.set_camera_pose(camera.pose) | ||
else: | ||
viewer = None | ||
|
||
for actor in robot.get_links(): | ||
for visual in actor.get_visual_bodies(): | ||
for mesh in visual.get_render_shapes(): | ||
mat = mesh.material | ||
mat.set_base_color(np.array([0.3, 0.3, 0.3, 1])) | ||
mat.set_specular(0.7) | ||
mat.set_metallic(0.1) | ||
|
||
return scene, viewer | ||
|
||
|
||
def build_retargeting(robot_name): | ||
if robot_name == "allegro": | ||
config_path = "teleop/allegro_hand_right.yml" | ||
elif robot_name == "shadow": | ||
config_path = "teleop/shadow_hand_right.yml" | ||
elif robot_name == "schunk": | ||
config_path = "teleop/schunk_svh_hand_right.yml" | ||
else: | ||
raise ValueError(f"Unrecognized robot_name: {robot_name}") | ||
|
||
test_config = get_retargeting_config(str(Path(__file__).parent.parent / f"dex_retargeting/configs/{config_path}")) | ||
seq_retargeting = test_config.build() | ||
return seq_retargeting, test_config | ||
|
||
|
||
def retarget_video(seq_retargeting: SeqRetargeting, scene: sapien.Scene, viewer): | ||
video_path = Path(__file__).parent / "data/human_hand_video.mp4" | ||
cap = cv2.VideoCapture(str(video_path)) | ||
robot = scene.get_all_articulations()[0] | ||
|
||
if not RECORD_VIDEO: | ||
writer = cv2.VideoWriter(f"data/output_{robot_name}.mp4", cv2.VideoWriter_fourcc(*"mp4v"), 30.0, (1280, 720)) | ||
|
||
if not cap.isOpened(): | ||
print("Error: Could not open video file.") | ||
else: | ||
detector = SingleHandDetector(hand_type="Right", selfie=False) | ||
while cap.isOpened(): | ||
ret, frame = cap.read() | ||
|
||
if not ret: | ||
break | ||
|
||
rgb = frame[..., ::-1] | ||
num_box, joint_pos, keypoint_2d, mediapipe_wrist_rot = detector.detect(rgb) | ||
|
||
retargeting_type = seq_retargeting.optimizer.retargeting_type | ||
indices = seq_retargeting.optimizer.target_link_human_indices | ||
if retargeting_type == "VECTOR": | ||
origin_indices = indices[0, :] | ||
task_indices = indices[1, :] | ||
ref_value = joint_pos[task_indices, :] - joint_pos[origin_indices, :] | ||
elif retargeting_type == "POSITION": | ||
indices = indices | ||
ref_value = joint_pos[indices, :] | ||
else: | ||
raise ValueError(f"Unknown retargeting type: {retargeting_type}") | ||
qpos = retargeting.retarget(ref_value) | ||
robot.set_qpos(qpos) | ||
if RECORD_VIDEO: | ||
for _ in range(3): | ||
viewer.render() | ||
else: | ||
cam = scene.get_cameras()[0] | ||
scene.update_render() | ||
cam.take_picture() | ||
rgb = cam.get_texture("Color")[..., :3] | ||
rgb = (np.clip(rgb, 0, 1) * 255).astype(np.uint8) | ||
seg = cam.get_visual_actor_segmentation()[..., 0] < 1 | ||
rgb[seg, :] = [255, 255, 255] | ||
writer.write(rgb[..., ::-1]) | ||
print(seq_retargeting.num_retargeting) | ||
|
||
cap.release() | ||
cv2.destroyAllWindows() | ||
if not RECORD_VIDEO: | ||
writer.release() | ||
|
||
|
||
if __name__ == "__main__": | ||
robot_name = ["allegro", "shadow", "schunk"][1] | ||
robot_dir = Path(__file__).parent.parent / "assets" / "robots" | ||
RetargetingConfig.set_default_urdf_dir(str(robot_dir)) | ||
retargeting, cfg = build_retargeting(robot_name) | ||
scene, viewer = setup_sapien_viz_scene(cfg.urdf_path) | ||
retarget_video(retargeting, scene, viewer) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,138 @@ | ||
import mediapipe as mp | ||
import mediapipe.framework as framework | ||
import numpy as np | ||
from mediapipe.framework.formats import landmark_pb2 | ||
from mediapipe.python.solutions import hands_connections | ||
from mediapipe.python.solutions.drawing_utils import DrawingSpec | ||
from mediapipe.python.solutions.hands import HandLandmark | ||
|
||
OPERATOR2MANO_RIGHT = np.array( | ||
[ | ||
[0, 0, -1], | ||
[-1, 0, 0], | ||
[0, 1, 0], | ||
] | ||
) | ||
|
||
OPERATOR2MANO_LEFT = np.array( | ||
[ | ||
[0, 0, -1], | ||
[1, 0, 0], | ||
[0, -1, 0], | ||
] | ||
) | ||
|
||
|
||
class SingleHandDetector: | ||
def __init__(self, hand_type="Right", min_detection_confidence=0.8, min_tracking_confidence=0.8, selfie=False): | ||
self.hand_detector = mp.solutions.hands.Hands( | ||
static_image_mode=False, | ||
max_num_hands=1, | ||
min_detection_confidence=min_detection_confidence, | ||
min_tracking_confidence=min_tracking_confidence, | ||
) | ||
self.selfie = selfie | ||
self.operator2mano = OPERATOR2MANO_RIGHT if hand_type == "Right" else OPERATOR2MANO_LEFT | ||
inverse_hand_dict = {"Right": "Left", "Left": "Right"} | ||
self.detected_hand_type = hand_type if selfie else inverse_hand_dict[hand_type] | ||
|
||
@staticmethod | ||
def draw_skeleton_on_image(image, keypoint_2d: landmark_pb2.NormalizedLandmarkList, style="white"): | ||
if style == "default": | ||
mp.solutions.drawing_utils.draw_landmarks( | ||
image, | ||
keypoint_2d, | ||
mp.solutions.hands.HAND_CONNECTIONS, | ||
mp.solutions.drawing_styles.get_default_hand_landmarks_style(), | ||
mp.solutions.drawing_styles.get_default_hand_connections_style(), | ||
) | ||
elif style == "white": | ||
landmark_style = {} | ||
for landmark in HandLandmark: | ||
landmark_style[landmark] = DrawingSpec(color=(255, 48, 48), circle_radius=4, thickness=-1) | ||
|
||
connections = hands_connections.HAND_CONNECTIONS | ||
connection_style = {} | ||
for pair in connections: | ||
connection_style[pair] = DrawingSpec(thickness=2) | ||
|
||
mp.solutions.drawing_utils.draw_landmarks( | ||
image, keypoint_2d, mp.solutions.hands.HAND_CONNECTIONS, landmark_style, connection_style | ||
) | ||
|
||
return image | ||
|
||
def detect(self, rgb): | ||
results = self.hand_detector.process(rgb) | ||
if not results.multi_hand_landmarks: | ||
return 0, None, None, None | ||
|
||
desired_hand_num = -1 | ||
for i in range(len(results.multi_hand_landmarks)): | ||
label = results.multi_handedness[i].ListFields()[0][1][0].label | ||
if label == self.detected_hand_type: | ||
desired_hand_num = i | ||
break | ||
if desired_hand_num < 0: | ||
return 0, None, None, None | ||
|
||
keypoint_3d = results.multi_hand_world_landmarks[desired_hand_num] | ||
keypoint_2d = results.multi_hand_landmarks[desired_hand_num] | ||
num_box = len(results.multi_hand_landmarks) | ||
|
||
# Parse 3d keypoints from MediaPipe hand detector | ||
keypoint_3d_array = self.parse_keypoint_3d(keypoint_3d) | ||
keypoint_3d_array = keypoint_3d_array - keypoint_3d_array[0:1, :] | ||
mediapipe_wrist_rot = self.estimate_frame_from_hand_points(keypoint_3d_array) | ||
joint_pos = keypoint_3d_array @ mediapipe_wrist_rot @ self.operator2mano | ||
|
||
return num_box, joint_pos, keypoint_2d, mediapipe_wrist_rot | ||
|
||
@staticmethod | ||
def parse_keypoint_3d(keypoint_3d: framework.formats.landmark_pb2.LandmarkList) -> np.ndarray: | ||
keypoint = np.empty([21, 3]) | ||
for i in range(21): | ||
keypoint[i][0] = keypoint_3d.landmark[i].x | ||
keypoint[i][1] = keypoint_3d.landmark[i].y | ||
keypoint[i][2] = keypoint_3d.landmark[i].z | ||
return keypoint | ||
|
||
@staticmethod | ||
def parse_keypoint_2d(keypoint_2d: landmark_pb2.NormalizedLandmarkList, img_size) -> np.ndarray: | ||
keypoint = np.empty([21, 2]) | ||
for i in range(21): | ||
keypoint[i][0] = keypoint_2d.landmark[i].x | ||
keypoint[i][1] = keypoint_2d.landmark[i].y | ||
keypoint = keypoint * np.array([img_size[1], img_size[0]])[None, :] | ||
return keypoint | ||
|
||
@staticmethod | ||
def estimate_frame_from_hand_points(keypoint_3d_array: np.ndarray) -> np.ndarray: | ||
""" | ||
Compute the 3D coordinate frame (orientation only) from detected 3d key points | ||
:param points: keypoint3 detected from MediaPipe detector. Order: [wrist, index, middle, pinky] | ||
:return: the coordinate frame of wrist in MANO convention | ||
""" | ||
assert keypoint_3d_array.shape == (21, 3) | ||
points = keypoint_3d_array[[0, 5, 9], :] | ||
|
||
# Compute vector from palm to the first joint of middle finger | ||
x_vector = points[0] - points[2] | ||
|
||
# Normal fitting with SVD | ||
points = points - np.mean(points, axis=0, keepdims=True) | ||
u, s, v = np.linalg.svd(points) | ||
|
||
normal = v[2, :] | ||
|
||
# Gram–Schmidt Orthonormalize | ||
x = x_vector - np.sum(x_vector * normal) * normal | ||
x = x / np.linalg.norm(x) | ||
z = np.cross(x, normal) | ||
|
||
# We assume that the vector from pinky to index is similar the z axis in MANO convention | ||
if np.sum(z * (points[1] - points[2])) < 0: | ||
normal *= -1 | ||
z *= -1 | ||
frame = np.stack([x, normal, z], axis=1) | ||
return frame |