Mirror of https://github.com/luckyrobots/open_phantom.git (synced 2025-04-04 02:52:18 +00:00)
improve icp by adding a const scale factor
This commit is contained in:
parent 1e6722ae44
commit ececd98449
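In short, the diff below renames HandProcessor to ProcessHand, swaps the MiDaS DPT_Hybrid model for DPT_Large, introduces named constants for depth scaling, normalizes landmark depth before ICP, tightens the ICP correspondence distance, and normalizes the gripper width. A minimal Python sketch of the new constant-scale depth conversion, with the constant values taken from the hunks below (the helper name is ours, not from the repo):

import numpy as np

# Constants introduced by this commit (values copied from the diff)
HAND_WIDTH_MM = 90.0   # average width of a male hand in mm
CLOUD_Z_SCALE = 5.0

def scale_depth_to_metric(z_values: np.ndarray, depth_map: np.ndarray) -> np.ndarray:
    # Old behavior was a fixed z_metric = z_values * 0.5.
    # New behavior scales relative depth by the hand width and a constant cloud scale factor.
    return z_values * HAND_WIDTH_MM * CLOUD_Z_SCALE / depth_map.max()

# Example with a toy depth map: values end up in a hand-sized metric range.
depth_map = np.random.rand(480, 640)
z_values = depth_map[depth_map > 0.5]
print(scale_depth_to_metric(z_values, depth_map).max())

The full hunks of the change follow.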
@@ -1,23 +1,30 @@
 import os
 import time
 
 import cv2
-import depth_pro
-import mediapipe as mp
+import torch
 import numpy as np
 import open3d as o3d
-import torch
-from PIL import Image
-from robot_manager import RobotManager
-from sam2.sam2_image_predictor import SAM2ImagePredictor
+import mediapipe as mp
 from tqdm import tqdm
+from collections import deque
 from utils.visualizations import *
+from sam2.sam2_image_predictor import SAM2ImagePredictor
 
 
-class HandProcessor:
+# TODO: Optimize these constants for better results
+HAND_WIDTH_MM = 90.0  # Average width of male hand in mm
+CLOUD_Z_SCALE = 5.0
+# Maximum expected distance between human thumb and index finger in mm when fully extended.
+MAXIMUM_HAND_WIDTH_MM = 100.0
+
+
+class ProcessHand:
     def __init__(self) -> None:
         self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 
+        # Initialize MediaPipe Hands for hand detection
+        print("Loading MediaPipe Hands model...")
         self.mp_hands = mp.solutions.hands
         self.mp_drawing = mp.solutions.drawing_utils
         self.mp_drawing_styles = mp.solutions.drawing_styles
@@ -28,10 +35,9 @@ class HandProcessor:
             static_image_mode=False,
         )
 
-        # NOTE: Look into better depth estimation models
         # Initialize MiDaS for depth estimation
         print("Loading MiDaS model...")
-        self.midas = torch.hub.load("intel-isl/MiDaS", "DPT_Hybrid")
+        self.midas = torch.hub.load("intel-isl/MiDaS", "DPT_Large")
         self.midas.to(self.device)
         self.midas.eval()
 
@@ -45,11 +51,17 @@ class HandProcessor:
             "facebook/sam2-hiera-large"
         )
 
+        self.gripper_width_buffer = deque(maxlen=100)
+
+    """
+    Create a segmentation mask over the hand using SAM2 model
+    """
+
     def _create_mask(self, frame: np.ndarray, landmarks: list) -> np.ndarray:
         height, width = frame.shape[:2]
+        # Convert image to RGB for SAM2 model
         frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
 
-        # Set image in SAM2 predictor
         with torch.inference_mode(), torch.autocast("cuda", dtype=torch.bfloat16):
             self.sam2_predictor.set_image(frame_rgb)  # Set image for prediction
 
@@ -61,7 +73,7 @@ class HandProcessor:
 
             input_points = np.array(points)
 
-            # Predict mask with point prompts
+            # Predict mask using point prompts
            masks, _, _ = self.sam2_predictor.predict(
                 point_coords=input_points,  # Pass the points as prompts
                 point_labels=np.ones(
@@ -79,7 +91,6 @@ class HandProcessor:
     Estimate depth map using MiDaS model
     """
 
-    # TODO: Swap MiDaS for ml-depth-pro model
     def _estimate_depth(self, image: np.ndarray) -> tuple:
         img = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
 
@@ -97,7 +108,7 @@ class HandProcessor:
                 align_corners=False,
             ).squeeze()
 
-        # Convert to numpy and normalize to 0-255 for visualization
+        # Convert to numpy and normalize for visualization
         depth_map = prediction.cpu().numpy()
         depth_min = depth_map.min()
         depth_max = depth_map.max()
@@ -105,11 +116,17 @@ class HandProcessor:
 
         return depth_map, depth_map_normalized.astype(np.uint8)
 
-    # TODO: Look into better depth scaling
-    def _create_cloud(self, depth_map: np.ndarray, hand_mask: np.ndarray) -> np.ndarray:
+    """
+    Create a point cloud from combining depth map and segmented mask
+    by back-projecting to 3D using camera intrinsics and depth values
+    """
+
+    def _create_cloud(
+        self, depth_map: np.ndarray, segmented_mask: np.ndarray
+    ) -> np.ndarray:
         focal_x, focal_y, center_x, center_y = self.camera_intrinsics
 
-        v_coords, u_coords = np.where(hand_mask > 0)
+        v_coords, u_coords = np.where(segmented_mask > 0)
         z_values = depth_map[v_coords, u_coords]
 
         # Filter out zero depth values
@@ -118,7 +135,7 @@ class HandProcessor:
         v_coords = v_coords[valid_indices]
         z_values = z_values[valid_indices]
 
-        z_metric = z_values * 0.5
+        z_metric = z_values * HAND_WIDTH_MM * CLOUD_Z_SCALE / depth_map.max()
 
         # Back-project to 3D using camera intrinsics
         x_values = (u_coords - center_x) * z_metric / focal_x
@@ -128,15 +145,25 @@ class HandProcessor:
 
         return points
 
-    # TODO: Look into better depth scaling
+    """
+    Create hand mesh from hand landmarks
+    """
+
     def _create_mesh(self, landmarks: list, image_dims: tuple) -> np.ndarray:
         width, height = image_dims
 
+        # Extract just z values to understand their range
+        z_values = [landmark.z for landmark in landmarks.landmark]
+        z_min = min(z_values)
+        z_max = max(z_values)
+
         vertices = []
         for landmark in landmarks.landmark:
-            vertices.append(
-                [landmark.x * width, landmark.y * height, landmark.z * width]
-            )
+            # Scale z to same range as HAND_WIDTH_MM
+            normalized_z = (landmark.z - z_min) / (z_max - z_min + 1e-6)
+            scaled_z = normalized_z * HAND_WIDTH_MM
+
+            vertices.append([landmark.x * width, landmark.y * height, scaled_z])
 
         # Define faces (triangles) connecting landmarks
         faces = [
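The new _create_mesh normalizes each landmark's relative z into [0, 1] and rescales it to the hand-width range, so the mesh vertices and the back-projected point cloud sit on a comparable metric scale before ICP. A standalone sketch of that scaling, assuming MediaPipe-style landmark objects with x, y, z attributes (the Landmark dataclass and function name are placeholders for this sketch):

from dataclasses import dataclass

HAND_WIDTH_MM = 90.0  # same constant as in the diff

@dataclass
class Landmark:  # stand-in for a MediaPipe landmark
    x: float
    y: float
    z: float

def scale_landmark_z(landmarks: list, width: int, height: int) -> list:
    # Normalize relative z to [0, 1], then rescale to the hand-width range,
    # mirroring the new _create_mesh / _refine_landmarks logic.
    z_values = [lm.z for lm in landmarks]
    z_min, z_max = min(z_values), max(z_values)
    vertices = []
    for lm in landmarks:
        normalized_z = (lm.z - z_min) / (z_max - z_min + 1e-6)  # 1e-6 avoids division by zero
        vertices.append([lm.x * width, lm.y * height, normalized_z * HAND_WIDTH_MM])
    return vertices

# Example: three fake landmarks on a 640x480 frame.
fake = [Landmark(0.5, 0.5, -0.02), Landmark(0.6, 0.4, 0.01), Landmark(0.4, 0.6, 0.03)]
print(scale_landmark_z(fake, 640, 480))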
@@ -178,7 +205,10 @@ class HandProcessor:
 
         return np.array(dense_vertices), np.array(faces)
 
-    # TODO: Improve ICP registration for better alignment
+    """
+    Align hand mesh to point cloud using ICP for accurate 3D reconstruction
+    """
+
     def _icp_registration(self, cloud: np.ndarray, vertices: np.ndarray) -> np.ndarray:
         source = o3d.geometry.PointCloud()
         source.points = o3d.utility.Vector3dVector(vertices)
@@ -200,27 +230,39 @@ class HandProcessor:
         result = o3d.pipelines.registration.registration_icp(
             source,
             target,
-            max_correspondence_distance=0.1,  # Increased distance
+            max_correspondence_distance=0.05,
             init=trans_init,  # Initial transformation
             estimation_method=o3d.pipelines.registration.TransformationEstimationPointToPoint(),
             criteria=o3d.pipelines.registration.ICPConvergenceCriteria(
                 max_iteration=100
-            ),  # Increased iterations
+            ),
         )
 
         transformation = result.transformation
 
         return transformation
 
-    # TODO: Look into better depth scaling
-    def _refine_landmarks(self, landmarks: list, transform: int, image_dims: tuple):
+    """
+    Refine landmarks based on the icp transformation
+    """
+
+    def _refine_landmarks(
+        self, landmarks: list, transform: int, image_dims: tuple
+    ) -> list:
         width, height = image_dims
 
+        # Extract z range for normalization, similar to _create_mesh
+        z_values = [landmark.z for landmark in landmarks.landmark]
+        z_min = min(z_values)
+        z_max = max(z_values)
+
         refined_landmarks = []
         for landmark in landmarks.landmark:
-            point = np.array(
-                [landmark.x * width, landmark.y * height, landmark.z * width, 1.0]
-            )
+            # Use consistent scaling with _create_mesh
+            normalized_z = (landmark.z - z_min) / (z_max - z_min + 1e-6)
+            scaled_z = normalized_z * HAND_WIDTH_MM
+
+            point = np.array([landmark.x * width, landmark.y * height, scaled_z, 1.0])
 
             # Apply transformation to 3D point
             transformed = np.dot(transform, point)
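The registration itself keeps point-to-point ICP with 100 iterations but tightens max_correspondence_distance from 0.1 to 0.05. A self-contained Open3D sketch of that call, with the function name and inputs as placeholders:

import numpy as np
import open3d as o3d

def icp_align(vertices: np.ndarray, cloud: np.ndarray, trans_init: np.ndarray) -> np.ndarray:
    # Source: hand-mesh vertices. Target: back-projected point cloud.
    source = o3d.geometry.PointCloud()
    source.points = o3d.utility.Vector3dVector(vertices)
    target = o3d.geometry.PointCloud()
    target.points = o3d.utility.Vector3dVector(cloud)

    result = o3d.pipelines.registration.registration_icp(
        source,
        target,
        max_correspondence_distance=0.05,  # tightened from 0.1 in this commit
        init=trans_init,
        estimation_method=o3d.pipelines.registration.TransformationEstimationPointToPoint(),
        criteria=o3d.pipelines.registration.ICPConvergenceCriteria(max_iteration=100),
    )
    return result.transformation  # 4x4 homogeneous transform

# Example with synthetic data and an identity initial guess.
pts = np.random.rand(100, 3)
print(icp_align(pts, pts + 0.01, np.eye(4)))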
@@ -228,6 +270,7 @@ class HandProcessor:
 
         return refined_landmarks
 
+    # TODO: Implement better constraints that limit last joint of each finger to a single DOF
     def _apply_constraints(self, landmarks: list):
         constrained = np.array(landmarks)
 
@@ -236,8 +279,7 @@ class HandProcessor:
         thumb_indices = [1, 2, 3, 4]
         index_indices = [5, 6, 7, 8]
 
-        # Constrain the last two joints of thumb and index finger
-        # as mentioned in the paper
+        # Constrain the last two joints of thumb and index finger as per the paper
         for finger_indices in [thumb_indices, index_indices]:
             # Get the last three joints (two segments)
             if len(finger_indices) >= 3:
@@ -250,8 +292,7 @@ class HandProcessor:
                 dir1 = joint2 - joint1
                 dir1 = dir1 / np.linalg.norm(dir1)
 
-                # Instead of full ball joint, constrain the last joint's direction
-                # to follow similar direction as the previous segment
+                # Instead of full ball joint, constrain last joint's direction to follow previous segment
                 ideal_dir = dir1.copy()
                 actual_dir = joint3 - joint2
                 actual_length = np.linalg.norm(actual_dir)
@@ -265,8 +306,14 @@ class HandProcessor:
 
         return constrained
 
-    def _get_robot_params(self, refined_landmarks):
-        # Extract keypoints
+    """
+    Extract robot parameters from refined landmarks:
+    1. Target Position: Midpoint between thumb tip and index tip
+    2. Target Orientation: Normal to the best-fitting plane of thumb and index finger
+    3. Gripper Width: Distance between thumb tip and index tip
+    """
+
+    def _get_robot_params(self, refined_landmarks: list) -> tuple:
         landmarks = np.array(refined_landmarks)
 
         # Define indices for specific parts of the hand
@@ -293,54 +340,52 @@ class HandProcessor:
         # Use SVD to find the normal to the best-fitting plane
         u, s, vh = np.linalg.svd(centered_points)
         # The normal is the last right singular vector
-        normal = vh[2, :]
-        # Ensure normal is a unit vector
-        normal = normal / np.linalg.norm(normal)
+        plane_normal = vh[2, :]
+        plane_normal = plane_normal / np.linalg.norm(plane_normal)
 
         # Fit a principal axis through thumb points
-        thumb_centroid = np.mean(thumb_points, axis=0)
-        thumb_centered = thumb_points - thumb_centroid
+        # Using direction from thumb base to tip for more robustness
+        thumb_direction = landmarks[thumb_tip_idx] - landmarks[thumb_indices[0]]
+        thumb_axis = thumb_direction / np.linalg.norm(thumb_direction)
 
-        # Use SVD again to find direction of maximum variance (principal axis)
-        u, s, vh = np.linalg.svd(thumb_centered)
-        principal_axis = vh[0, :]  # First singular vector
+        # Ensure thumb_axis is orthogonal to plane_normal
+        thumb_axis = thumb_axis - np.dot(thumb_axis, plane_normal) * plane_normal
+        thumb_axis = thumb_axis / np.linalg.norm(thumb_axis)
 
-        # Ensure principal axis is a unit vector
-        principal_axis = principal_axis / np.linalg.norm(principal_axis)
+        # Compute third axis as cross product to create orthogonal frame
+        cross_axis = np.cross(plane_normal, thumb_axis)
+        cross_axis = cross_axis / np.linalg.norm(cross_axis)
 
-        # Construct orthogonal vectors for orientation matrix
-        z_axis = normal  # Normal to the plane
-        x_axis = principal_axis  # Along the thumb
-
-        # Ensure x_axis is orthogonal to z_axis
-        x_axis = x_axis - np.dot(x_axis, z_axis) * z_axis
-        x_axis = x_axis / np.linalg.norm(x_axis)
-
-        # Compute y_axis as cross product to ensure orthogonality
-        y_axis = np.cross(z_axis, x_axis)
-
-        # Create rotation matrix
-        rotation_matrix = np.column_stack([x_axis, y_axis, z_axis])
+        # Create rotation matrix that aligns with the paper's description
+        # z-axis as normal, x-axis along thumb direction
+        rotation_matrix = np.column_stack([thumb_axis, cross_axis, plane_normal])
 
         # 3. Calculate gripper width
         gripper_width = np.linalg.norm(thumb_tip - index_tip)
 
-        # Apply percentile threshold to mitigate slippage (as mentioned in the paper)
-        # Store gripper widths in a buffer to compute percentiles
-        if not hasattr(self, "gripper_width_buffer"):
-            self.gripper_width_buffer = []
-
         self.gripper_width_buffer.append(gripper_width)
-        # Keep only the last 100 values
-        if len(self.gripper_width_buffer) > 100:
-            self.gripper_width_buffer.pop(0)
 
-        # Apply 20th percentile threshold
+        # Apply 20th percentile threshold as specified in the paper
         if len(self.gripper_width_buffer) > 5:  # Need enough samples
             min_width = np.percentile(self.gripper_width_buffer, 20)
             if gripper_width < min_width:
-                gripper_width = min_width
+                gripper_width = 0.0  # Fully closed gripper when below threshold
+            else:
+                # Scale gripper_width to robot's specific range
+                gripper_width = min(1.0, gripper_width / MAXIMUM_HAND_WIDTH_MM)
+
+        # Convert from camera frame to robot frame
+        # Note: This requires the extrinsic matrix from camera to robot
+        # If extrinsics are available, uncomment and use this code:
+        # if hasattr(self, "camera_to_robot_transform"):
+        #     # Convert position to homogeneous coordinates
+        #     pos_homogeneous = np.append(position, 1.0)
+        #     # Apply transformation
+        #     robot_pos_homogeneous = np.dot(self.camera_to_robot_transform, pos_homogeneous)
+        #     position = robot_pos_homogeneous[:3]
+        #
+        #     # Convert rotation (special orthogonal transformation)
+        #     rotation_in_robot_frame = np.dot(self.camera_to_robot_transform[:3, :3], rotation_matrix)
+        #     rotation_matrix = rotation_in_robot_frame
 
         return position, rotation_matrix, gripper_width
 
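Gripper width handling also changes: widths below the 20th percentile of a rolling buffer now map to a fully closed gripper (0.0), and everything else is normalized by MAXIMUM_HAND_WIDTH_MM into a [0, 1] command. A small sketch of that logic, assuming the deque buffer created in the new __init__ (the function name is a placeholder):

from collections import deque
import numpy as np

MAXIMUM_HAND_WIDTH_MM = 100.0  # same constant as in the diff

def normalize_gripper_width(gripper_width: float, buffer: deque) -> float:
    # Track recent widths and use the 20th percentile as a "closed" threshold.
    buffer.append(gripper_width)
    if len(buffer) <= 5:  # not enough history yet; the diff leaves the raw width untouched here
        return gripper_width
    min_width = np.percentile(buffer, 20)
    if gripper_width < min_width:
        return 0.0  # below the threshold: treat the gripper as fully closed
    # Otherwise scale the thumb-index distance into the robot's [0, 1] command range.
    return min(1.0, gripper_width / MAXIMUM_HAND_WIDTH_MM)

# Example: a rolling buffer of the last 100 widths, as created in __init__.
widths = deque(maxlen=100)
for w in [60.0, 62.0, 58.0, 61.0, 59.0, 40.0, 70.0]:
    print(normalize_gripper_width(w, widths))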
@@ -483,16 +528,16 @@ class HandProcessor:
         # Process if hand is detected
         if results.multi_hand_landmarks:
             for hand_landmarks in results.multi_hand_landmarks:
-                hand_mask = self._create_mask(frame, hand_landmarks)
+                segmented_mask = self._create_mask(frame, hand_landmarks)
                 mask_overlay = frame.copy()
-                mask_overlay[hand_mask > 0] = [0, 0, 255]  # Red color for mask
+                mask_overlay[segmented_mask > 0] = [0, 0, 255]  # Red color for mask
                 segmented_frame = cv2.addWeighted(frame, 0.7, mask_overlay, 0.3, 0)
 
                 depth_map, depth_vis = self._estimate_depth(frame)
                 depth_colored = cv2.applyColorMap(depth_vis, cv2.COLORMAP_JET)
                 depth_frame = depth_colored.copy()
 
-                cloud = self._create_cloud(depth_map, hand_mask)
+                cloud = self._create_cloud(depth_map, segmented_mask)
 
                 hand_vertices, hand_faces = self._create_mesh(
                     hand_landmarks, (width, height)