Merge pull request #20 from yuecideng/yuecideng/LabelMaker6D
Label Maker for Instance-level 6D Pose Estimation
yuecideng committed Apr 26, 2022
2 parents 4c07588 + 685b3b1 commit df47ee7
Showing 11 changed files with 1,061 additions and 0 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -41,5 +41,6 @@ __pycache__/
*.pyc

# App/Reconstruction folder
dataset/
fragments/
scene/
220 changes: 220 additions & 0 deletions app/camera_manager.py
@@ -0,0 +1,220 @@
#!/usr/bin/python3
# -*- coding: utf-8 -*-

import time
import numpy as np
import pyrealsense2 as rs
from enum import IntEnum

class RealSenseResolution(IntEnum):
    Low = 0  # 480X270
    Medium = 1  # 640X480
    High = 2  # 848X480
    Max = 3  # 1280X720


class RealSenseFPS(IntEnum):
    Low = 0  # 6
    Medium = 1  # 15
    High = 2  # 30
    Max = 3  # 60


class RealSenseManager:

    def __init__(self,
                 resolution=RealSenseResolution.High,
                 fps=RealSenseFPS.High):
        self.align = rs.align(rs.stream.color)
        self.config = rs.config()

        self.resolution = None
        if resolution == RealSenseResolution.Low:
            self.resolution = (480, 270)
        elif resolution == RealSenseResolution.Medium:
            self.resolution = (640, 480)
        elif resolution == RealSenseResolution.High:
            self.resolution = (848, 480)
        else:
            self.resolution = (1280, 720)

        fps_ = None
        if fps == RealSenseFPS.Low:
            fps_ = 6
        elif fps == RealSenseFPS.Medium:
            fps_ = 15
        elif fps == RealSenseFPS.High:
            fps_ = 30
        else:
            fps_ = 60

        self.config.enable_stream(rs.stream.depth, self.resolution[0],
                                  self.resolution[1], rs.format.z16, fps_)
        # cap the color stream at 30 FPS
        fps_color = 30 if fps_ > 30 else fps_
        self.config.enable_stream(rs.stream.color, self.resolution[0],
                                  self.resolution[1], rs.format.rgb8, fps_color)
        self.pipeline = rs.pipeline()

        self.profile = None
        self.depth_profile = None
        self.color_profile = None
        self.sensor = None

    def get_intrinsics(self, type='color'):
        """ Get intrinsics of the RGB camera or IR camera, which vary with resolution
        Args:
            type ([string]): 'color' or 'depth'
        Returns:
            [tuple[List, List]]: K and dist
        """
        if self.sensor is None:
            print('Sensor not opened!')
            return None
        intrin = None
        if type == 'color':
            intrin = self.profile.get_stream(
                rs.stream.color).as_video_stream_profile().get_intrinsics()
        else:
            intrin = self.profile.get_stream(
                rs.stream.depth).as_video_stream_profile().get_intrinsics()
        K = [intrin.fx, intrin.fy, intrin.ppx, intrin.ppy]
        dist = [
            intrin.coeffs[0], intrin.coeffs[1], intrin.coeffs[2],
            intrin.coeffs[3], intrin.coeffs[4]
        ]
        return (K, dist)

    def get_extrinsics(self):
        """ Get extrinsics from IR camera to RGB camera
        Returns:
            [np.ndarray(4 X 4)]: homogeneous transformation from IR camera to RGB camera
        """
        if self.sensor is None:
            print('Sensor not opened!')
            return None

        res = self.depth_profile.get_extrinsics_to(self.color_profile)
        rotation = np.array(res.rotation).reshape((3, 3))
        translation = np.array(res.translation)
        T = np.identity(4)
        T[:3, :3] = rotation
        T[:3, 3] = translation

        return T

    def get_resolution(self):
        """ Get resolution of the camera
        Returns:
            tuple: (width, height)
        """
        return self.resolution

    def set_laser_power(self, power):
        """ Set laser power within range [10, 360]
        Args:
            power ([int]): laser power
        """
        power = max(10, min(360, power))
        if self.sensor is None:
            print('Sensor not opened!')
            return
        self.sensor.set_option(rs.option.laser_power, power)

    def get_data(self, hole_fill=False):
        """ Get data from the camera
        Args:
            hole_fill ([bool]): whether to fill holes in the depth image
        Returns:
            [tuple[np.ndarray, np.ndarray]]: color (RGB) and depth (uint16) images
        """
        if self.sensor is None:
            print('Sensor not opened!')
            return None

        frames = self.pipeline.wait_for_frames()
        aligned_frames = self.align.process(frames)
        color_frame = aligned_frames.get_color_frame()
        depth_frame = aligned_frames.get_depth_frame()
        if not color_frame or not depth_frame:
            print('Can not get data from realsense!')
            return None
        # fill = 2 uses the 4 nearest neighbors for hole filling
        fill = 2 if hole_fill else 0
        depth_frame = rs.spatial_filter(0.5, 20, 2, fill).process(depth_frame)

        # convert to numpy arrays
        color_image = np.asanyarray(color_frame.get_data())
        depth_image = np.asanyarray(depth_frame.get_data())

        return (color_image, depth_image)

    def open(self):
        """ Open the camera
        Returns:
            [bool]: True on success
        """
        self.profile = self.pipeline.start(self.config)
        self.depth_profile = rs.video_stream_profile(
            self.profile.get_stream(rs.stream.depth))
        self.color_profile = rs.video_stream_profile(
            self.profile.get_stream(rs.stream.color))
        device = self.profile.get_device()
        if len(device.query_sensors()) == 0:
            print('Can not find realsense sensor!')
            return False
        else:
            self.sensor = device.query_sensors()[0]
            # set default laser power
            self.sensor.set_option(rs.option.laser_power, 200)
            # set to high density mode
            self.sensor.set_option(rs.option.visual_preset, 4)
            return True

    def close(self):
        self.pipeline.stop()
        self.sensor = None
        self.profile = None
        self.depth_profile = None
        self.color_profile = None


if __name__ == '__main__':
    import cv2

    camera = RealSenseManager(resolution=RealSenseResolution.High,
                              fps=RealSenseFPS.High)
    camera.open()
    try:
        while True:
            t0 = time.time()
            # get_data returns numpy arrays, so colorize the depth with
            # OpenCV rather than rs.colorizer (which expects a frame)
            color_image, depth_image = camera.get_data(hole_fill=False)
            depth_color_image = cv2.applyColorMap(
                cv2.convertScaleAbs(depth_image, alpha=0.03),
                cv2.COLORMAP_JET)
            color_image_bgr = cv2.cvtColor(color_image, cv2.COLOR_RGB2BGR)
            t1 = time.time()
            fps = 'FPS: ' + str(int(1 / (t1 - t0)))
            cv2.putText(color_image_bgr, fps, (50, 50),
                        cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2,
                        cv2.LINE_AA)
            cv2.namedWindow('color image', cv2.WINDOW_AUTOSIZE)
            cv2.imshow('color image', color_image_bgr)
            cv2.namedWindow('depth image', cv2.WINDOW_AUTOSIZE)
            cv2.imshow('depth image', depth_color_image)
            cv2.waitKey(1)
    except KeyboardInterrupt:
        camera.close()
        cv2.destroyAllWindows()
    finally:
        print('Exit')
138 changes: 138 additions & 0 deletions app/label_maker/README.md
@@ -0,0 +1,138 @@
# Label Maker for 6D Pose Estimation
This is a simple offline label maker for **instance-level** 6D pose estimation ground-truth data generation, composed of several Python scripts. The whole pipeline is as follows:
- Collect RGBD data from the sensor.
- Run RGBD dense reconstruction.
- Initialize object poses.
- Render masks and generate labels.

## Requirements
- `opencv-python`
- `open3d`
- `misc3d` (with `reconstruction` module enabled)

## Pipeline
### Step 1: Collect RGBD data
We provide a simple script (`label_maker/record_data.py`) to collect RGBD data from a RealSense RGBD camera. The script exposes arguments to control the record mode and frame rate.

You can also collect the data yourself, but it is recommended to use the following structure:
```
dataset/
- color/
- 000000.png
- 000001.png
- ...
- depth/
- 000000.png
- 000001.png
- ...
```

**Note:**
You should collect no more than 300 pairs of RGBD data for each dataset; otherwise the scene reconstruction in the next step will be very slow. You can record data multiple times with different scenes (different models, backgrounds, etc.). If you prefer to write your own recorder, see the sketch below.
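
A minimal recording sketch using the `RealSenseManager` from `app/camera_manager.py` could look like this; the import path, frame count, and output paths are illustrative and this is not the actual `record_data.py`:
```python
import os
import cv2
from camera_manager import RealSenseManager, RealSenseResolution, RealSenseFPS

os.makedirs('dataset/color', exist_ok=True)
os.makedirs('dataset/depth', exist_ok=True)

camera = RealSenseManager(resolution=RealSenseResolution.High,
                          fps=RealSenseFPS.High)
camera.open()
try:
    for i in range(300):  # keep each dataset under ~300 pairs (see note above)
        color, depth = camera.get_data(hole_fill=False)
        name = '%06d.png' % i
        # color is RGB; OpenCV writes BGR, so convert before saving
        cv2.imwrite(os.path.join('dataset/color', name),
                    cv2.cvtColor(color, cv2.COLOR_RGB2BGR))
        # depth is uint16 (z16), which PNG stores losslessly
        cv2.imwrite(os.path.join('dataset/depth', name), depth)
finally:
    camera.close()
```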

### Step 2: Run RGBD dense reconstruction
Run `python3 reconstruction.py config.json` to get the reconstructed scene. You must change the `camera` parameters in `config.json` to match your camera's specifications.

You can use `draw_geometry.py` to visualize the reconstructed scene as a triangle mesh or point cloud.
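
Visualization amounts to loading the reconstruction output with Open3D. A minimal sketch (the output filename is an assumption; check the paths configured in `config.json`):
```python
import open3d as o3d

# output path is an assumption; the actual location is set in config.json
mesh = o3d.io.read_triangle_mesh('scene/integrated.ply')
mesh.compute_vertex_normals()
o3d.visualization.draw_geometries([mesh])
```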

### Step 3: Pose initialization
1. Prepare your model file with the following structure:
```
model/
- 0.ply
- 1.ply
- 2.ply
- ...
```
**You must use a number as each model's name; the number is the object id encoded in the rendered masks** (a quick check is sketched below).
2. Run `init_obj_pose.py --model_path <your/model/path> --data_path <your/data/path>` and follow the instructions printed in the terminal.

After finishing this step, you will find `init_poses.json` in your data path.
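
Before running the script, the numeric naming convention can be sanity-checked with a few lines (paths are illustrative):
```python
import os

model_path = 'model'  # illustrative path
# model file names double as object ids: model/0.ply -> obj_id 0, and so on
obj_ids = sorted(int(os.path.splitext(f)[0])
                 for f in os.listdir(model_path)
                 if f.endswith('.ply'))
print('object ids:', obj_ids)
```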

### Step 4: Render masks and generate labels
Run `python3 generate_labels.py --model_path <your/model/path> --data_path <your/data/path>` (add `--vis` to visualize the rendered instance masks).

The results are 16-bit masks stored in `dataset/mask` and a JSON file containing the 6D pose, bounding box, object id, and instance id of each instance. A minimal example with only one frame is shown below:
```json
{
"000000": [
{
"obj_id": 0,
"instance_id": 0,
"cam_R_m2c": [
[
0.9014091842603533,
0.43197344750891603,
-0.029332970840869017
],
[
0.22044418653117434,
-0.5162045410566827,
-0.8276093477100617
],
[
-0.3726470758716603,
0.7395483841100113,
-0.560537549504563
]
],
"cam_t_m2c": [
0.29020161109027537,
0.2501192190463131,
0.6792205163170392
],
"bbox": [
498,
398,
142,
82
]
},
{
"obj_id": 0,
"instance_id": 1,
"cam_R_m2c": [
[
0.816165164729526,
0.5773665735205735,
-0.022853088700338718
],
[
0.30813881463069986,
-0.4683609651098311,
-0.828063087741133
],
[
-0.4887994423073907,
0.6687943227499061,
-0.5601689558137602
]
],
"cam_t_m2c": [
0.12174972304257478,
0.18991541206314635,
0.7773315438193125
],
"bbox": [
350,
337,
144,
133
]
}
]
}
```

The value of each instance in the mask equals `v = obj_id * 1000 + instance_id`. You can decode an instance into an 8-bit mask image containing a single object with the following code:
```python
# you already have mask (16-bit, read with cv2.IMREAD_UNCHANGED), obj_id and instance_id
import numpy as np

mask_new = np.zeros((mask.shape[0], mask.shape[1]), dtype=np.uint8)
v = obj_id * 1000 + instance_id
mask_new[mask == v] = 255
```
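
To enumerate every instance present in a mask instead of starting from a known `obj_id`/`instance_id` pair, the same encoding can be inverted. A short sketch (how background pixels are encoded is an assumption; verify against your own masks):
```python
import cv2
import numpy as np

# masks are 16-bit, so read them unchanged
mask = cv2.imread('dataset/mask/000000.png', cv2.IMREAD_UNCHANGED)
for v in np.unique(mask):
    obj_id, instance_id = int(v) // 1000, int(v) % 1000
    single = np.zeros(mask.shape, dtype=np.uint8)
    single[mask == v] = 255
    # 'single' now isolates one instance; skip values that encode background
```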

## Reference
LabelFusion
- [paper](https://ieeexplore.ieee.org/abstract/document/8460950)
- [code](https://github.com/RobotLocomotion/LabelFusion)