improve icp by adding a const scale factor

Ethan Clark 2025-03-24 15:18:37 -07:00
parent 1e6722ae44
commit ececd98449

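The core of this change is a constant scale factor applied to the relative depth predicted by MiDaS before masked pixels are back-projected into the point cloud used for ICP. Below is a minimal standalone sketch of that scaling; `back_project` and the placeholder intrinsics tuple are illustrative names only, the constants and the `z_metric` expression come from the diff, and the `y_values` line simply mirrors the `x_values` line shown in the hunk.

import numpy as np

# Constants introduced by this commit (values taken from the diff below)
HAND_WIDTH_MM = 90.0  # Average width of male hand in mm
CLOUD_Z_SCALE = 5.0


def back_project(depth_map: np.ndarray, mask: np.ndarray, intrinsics: tuple) -> np.ndarray:
    """Back-project masked depth pixels into a 3D point cloud (illustrative helper)."""
    focal_x, focal_y, center_x, center_y = intrinsics

    # Keep only pixels inside the segmentation mask
    v_coords, u_coords = np.where(mask > 0)
    z_values = depth_map[v_coords, u_coords]

    # Old behaviour: z_metric = z_values * 0.5
    # New behaviour: rescale relative depth into a hand-sized metric range
    z_metric = z_values * HAND_WIDTH_MM * CLOUD_Z_SCALE / depth_map.max()

    # Pinhole back-projection using the camera intrinsics
    x_values = (u_coords - center_x) * z_metric / focal_x
    y_values = (v_coords - center_y) * z_metric / focal_y
    return np.column_stack([x_values, y_values, z_metric])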

@@ -1,23 +1,30 @@
 import os
 import time
 import cv2
-import depth_pro
-import mediapipe as mp
+import torch
 import numpy as np
 import open3d as o3d
-import torch
-from PIL import Image
-from robot_manager import RobotManager
-from sam2.sam2_image_predictor import SAM2ImagePredictor
+import mediapipe as mp
 from tqdm import tqdm
+from collections import deque
 from utils.visualizations import *
+from sam2.sam2_image_predictor import SAM2ImagePredictor

-class HandProcessor:
+# TODO: Optimize these constants for better results
+HAND_WIDTH_MM = 90.0  # Average width of male hand in mm
+CLOUD_Z_SCALE = 5.0
+# Maximum expected distance between human thumb and index finger in mm when fully extended.
+MAXIMUM_HAND_WIDTH_MM = 100.0
+
+
+class ProcessHand:
     def __init__(self) -> None:
         self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

+        # Initialize MediaPipe Hands for hand detection
+        print("Loading MediaPipe Hands model...")
         self.mp_hands = mp.solutions.hands
         self.mp_drawing = mp.solutions.drawing_utils
         self.mp_drawing_styles = mp.solutions.drawing_styles
@@ -28,10 +35,9 @@ class HandProcessor:
             static_image_mode=False,
         )

-        # NOTE: Look into better depth estimation models
         # Initialize MiDaS for depth estimation
         print("Loading MiDaS model...")
-        self.midas = torch.hub.load("intel-isl/MiDaS", "DPT_Hybrid")
+        self.midas = torch.hub.load("intel-isl/MiDaS", "DPT_Large")
         self.midas.to(self.device)
         self.midas.eval()
@@ -45,11 +51,17 @@ class HandProcessor:
             "facebook/sam2-hiera-large"
         )

+        self.gripper_width_buffer = deque(maxlen=100)
+
+    """
+    Create a segmentation mask over the hand using SAM2 model
+    """
+
     def _create_mask(self, frame: np.ndarray, landmarks: list) -> np.ndarray:
         height, width = frame.shape[:2]

+        # Convert image to RGB for SAM2 model
         frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

+        # Set image in SAM2 predictor
         with torch.inference_mode(), torch.autocast("cuda", dtype=torch.bfloat16):
             self.sam2_predictor.set_image(frame_rgb)  # Set image for prediction
@@ -61,7 +73,7 @@ class HandProcessor:
         input_points = np.array(points)

-        # Predict mask with point prompts
+        # Predict mask using point prompts
         masks, _, _ = self.sam2_predictor.predict(
             point_coords=input_points,  # Pass the points as prompts
             point_labels=np.ones(
@@ -79,7 +91,6 @@ class HandProcessor:
    Estimate depth map using MiDaS model
    """

-    # TODO: Swap MiDaS for ml-depth-pro model
    def _estimate_depth(self, image: np.ndarray) -> tuple:
        img = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
@@ -97,7 +108,7 @@ class HandProcessor:
            align_corners=False,
        ).squeeze()

-        # Convert to numpy and normalize to 0-255 for visualization
+        # Convert to numpy and normalize for visualization
        depth_map = prediction.cpu().numpy()
        depth_min = depth_map.min()
        depth_max = depth_map.max()
@@ -105,11 +116,17 @@ class HandProcessor:
        return depth_map, depth_map_normalized.astype(np.uint8)

-    # TODO: Look into better depth scaling
-    def _create_cloud(self, depth_map: np.ndarray, hand_mask: np.ndarray) -> np.ndarray:
+    """
+    Create a point cloud from combining depth map and segmented mask
+    by back-projecting to 3D using camera intrinsics and depth values
+    """
+
+    def _create_cloud(
+        self, depth_map: np.ndarray, segmented_mask: np.ndarray
+    ) -> np.ndarray:
        focal_x, focal_y, center_x, center_y = self.camera_intrinsics

-        v_coords, u_coords = np.where(hand_mask > 0)
+        v_coords, u_coords = np.where(segmented_mask > 0)
        z_values = depth_map[v_coords, u_coords]

        # Filter out zero depth values
@@ -118,7 +135,7 @@ class HandProcessor:
        v_coords = v_coords[valid_indices]
        z_values = z_values[valid_indices]

-        z_metric = z_values * 0.5
+        z_metric = z_values * HAND_WIDTH_MM * CLOUD_Z_SCALE / depth_map.max()

        # Back-project to 3D using camera intrinsics
        x_values = (u_coords - center_x) * z_metric / focal_x
@@ -128,15 +145,25 @@ class HandProcessor:
        return points

-    # TODO: Look into better depth scaling
+    """
+    Create hand mesh from hand landmarks
+    """
+
    def _create_mesh(self, landmarks: list, image_dims: tuple) -> np.ndarray:
        width, height = image_dims

+        # Extract just z values to understand their range
+        z_values = [landmark.z for landmark in landmarks.landmark]
+        z_min = min(z_values)
+        z_max = max(z_values)
+
        vertices = []
        for landmark in landmarks.landmark:
-            vertices.append(
-                [landmark.x * width, landmark.y * height, landmark.z * width]
-            )
+            # Scale z to same range as HAND_WIDTH_MM
+            normalized_z = (landmark.z - z_min) / (z_max - z_min + 1e-6)
+            scaled_z = normalized_z * HAND_WIDTH_MM
+            vertices.append([landmark.x * width, landmark.y * height, scaled_z])

        # Define faces (triangles) connecting landmarks
        faces = [
@@ -178,7 +205,10 @@ class HandProcessor:
        return np.array(dense_vertices), np.array(faces)

-    # TODO: Improve ICP registration for better alignment
+    """
+    Align hand mesh to point cloud using ICP for accurate 3D reconstruction
+    """
+
    def _icp_registration(self, cloud: np.ndarray, vertices: np.ndarray) -> np.ndarray:
        source = o3d.geometry.PointCloud()
        source.points = o3d.utility.Vector3dVector(vertices)
@@ -200,27 +230,39 @@ class HandProcessor:
        result = o3d.pipelines.registration.registration_icp(
            source,
            target,
-            max_correspondence_distance=0.1,  # Increased distance
+            max_correspondence_distance=0.05,
            init=trans_init,  # Initial transformation
            estimation_method=o3d.pipelines.registration.TransformationEstimationPointToPoint(),
            criteria=o3d.pipelines.registration.ICPConvergenceCriteria(
                max_iteration=100
-            ),  # Increased iterations
+            ),
        )

        transformation = result.transformation

        return transformation

-    # TODO: Look into better depth scaling
-    def _refine_landmarks(self, landmarks: list, transform: int, image_dims: tuple):
+    """
+    Refine landmarks based on the icp transformation
+    """
+
+    def _refine_landmarks(
+        self, landmarks: list, transform: int, image_dims: tuple
+    ) -> list:
        width, height = image_dims

+        # Extract z range for normalization, similar to _create_mesh
+        z_values = [landmark.z for landmark in landmarks.landmark]
+        z_min = min(z_values)
+        z_max = max(z_values)
+
        refined_landmarks = []
        for landmark in landmarks.landmark:
-            point = np.array(
-                [landmark.x * width, landmark.y * height, landmark.z * width, 1.0]
-            )
+            # Use consistent scaling with _create_mesh
+            normalized_z = (landmark.z - z_min) / (z_max - z_min + 1e-6)
+            scaled_z = normalized_z * HAND_WIDTH_MM
+            point = np.array([landmark.x * width, landmark.y * height, scaled_z, 1.0])

            # Apply transformation to 3D point
            transformed = np.dot(transform, point)
@@ -228,6 +270,7 @@ class HandProcessor:
        return refined_landmarks

+    # TODO: Implement better constraints that limit last joint of each finger to a single DOF
    def _apply_constraints(self, landmarks: list):
        constrained = np.array(landmarks)
@@ -236,8 +279,7 @@ class HandProcessor:
        thumb_indices = [1, 2, 3, 4]
        index_indices = [5, 6, 7, 8]

-        # Constrain the last two joints of thumb and index finger
-        # as mentioned in the paper
+        # Constrain the last two joints of thumb and index finger as per the paper
        for finger_indices in [thumb_indices, index_indices]:
            # Get the last three joints (two segments)
            if len(finger_indices) >= 3:
@@ -250,8 +292,7 @@ class HandProcessor:
                dir1 = joint2 - joint1
                dir1 = dir1 / np.linalg.norm(dir1)

-                # Instead of full ball joint, constrain the last joint's direction
-                # to follow similar direction as the previous segment
+                # Instead of full ball joint, constrain last joint's direction to follow previous segment
                ideal_dir = dir1.copy()
                actual_dir = joint3 - joint2
                actual_length = np.linalg.norm(actual_dir)
@@ -265,8 +306,14 @@ class HandProcessor:
        return constrained

-    def _get_robot_params(self, refined_landmarks):
-        # Extract keypoints
+    """
+    Extract robot parameters from refined landmarks:
+    1. Target Position: Midpoint between thumb tip and index tip
+    2. Target Orientation: Normal to the best-fitting plane of thumb and index finger
+    3. Gripper Width: Distance between thumb tip and index tip
+    """
+
+    def _get_robot_params(self, refined_landmarks: list) -> tuple:
        landmarks = np.array(refined_landmarks)

        # Define indices for specific parts of the hand
@@ -293,54 +340,52 @@ class HandProcessor:
        # Use SVD to find the normal to the best-fitting plane
        u, s, vh = np.linalg.svd(centered_points)

        # The normal is the last right singular vector
-        normal = vh[2, :]
-
-        # Ensure normal is a unit vector
-        normal = normal / np.linalg.norm(normal)
+        plane_normal = vh[2, :]
+        plane_normal = plane_normal / np.linalg.norm(plane_normal)

        # Fit a principal axis through thumb points
-        thumb_centroid = np.mean(thumb_points, axis=0)
-        thumb_centered = thumb_points - thumb_centroid
-
-        # Use SVD again to find direction of maximum variance (principal axis)
-        u, s, vh = np.linalg.svd(thumb_centered)
-        principal_axis = vh[0, :]  # First singular vector
-
-        # Ensure principal axis is a unit vector
-        principal_axis = principal_axis / np.linalg.norm(principal_axis)
-
-        # Construct orthogonal vectors for orientation matrix
-        z_axis = normal  # Normal to the plane
-        x_axis = principal_axis  # Along the thumb
-
-        # Ensure x_axis is orthogonal to z_axis
-        x_axis = x_axis - np.dot(x_axis, z_axis) * z_axis
-        x_axis = x_axis / np.linalg.norm(x_axis)
-
-        # Compute y_axis as cross product to ensure orthogonality
-        y_axis = np.cross(z_axis, x_axis)
-
-        # Create rotation matrix
-        rotation_matrix = np.column_stack([x_axis, y_axis, z_axis])
+        # Using direction from thumb base to tip for more robustness
+        thumb_direction = landmarks[thumb_tip_idx] - landmarks[thumb_indices[0]]
+        thumb_axis = thumb_direction / np.linalg.norm(thumb_direction)
+
+        # Ensure thumb_axis is orthogonal to plane_normal
+        thumb_axis = thumb_axis - np.dot(thumb_axis, plane_normal) * plane_normal
+        thumb_axis = thumb_axis / np.linalg.norm(thumb_axis)
+
+        # Compute third axis as cross product to create orthogonal frame
+        cross_axis = np.cross(plane_normal, thumb_axis)
+        cross_axis = cross_axis / np.linalg.norm(cross_axis)
+
+        # Create rotation matrix that aligns with the paper's description
+        # z-axis as normal, x-axis along thumb direction
+        rotation_matrix = np.column_stack([thumb_axis, cross_axis, plane_normal])

        # 3. Calculate gripper width
        gripper_width = np.linalg.norm(thumb_tip - index_tip)

-        # Apply percentile threshold to mitigate slippage (as mentioned in the paper)
-        # Store gripper widths in a buffer to compute percentiles
-        if not hasattr(self, "gripper_width_buffer"):
-            self.gripper_width_buffer = []
-
        self.gripper_width_buffer.append(gripper_width)

-        # Keep only the last 100 values
-        if len(self.gripper_width_buffer) > 100:
-            self.gripper_width_buffer.pop(0)
-
-        # Apply 20th percentile threshold
+        # Apply 20th percentile threshold as specified in the paper
        if len(self.gripper_width_buffer) > 5:  # Need enough samples
            min_width = np.percentile(self.gripper_width_buffer, 20)
            if gripper_width < min_width:
-                gripper_width = min_width
+                gripper_width = 0.0  # Fully closed gripper when below threshold
+            else:
+                # Scale gripper_width to robot's specific range
+                gripper_width = min(1.0, gripper_width / MAXIMUM_HAND_WIDTH_MM)
+
+        # Convert from camera frame to robot frame
+        # Note: This requires the extrinsic matrix from camera to robot
+        # If extrinsics are available, uncomment and use this code:
+        # if hasattr(self, "camera_to_robot_transform"):
+        #     # Convert position to homogeneous coordinates
+        #     pos_homogeneous = np.append(position, 1.0)
+        #     # Apply transformation
+        #     robot_pos_homogeneous = np.dot(self.camera_to_robot_transform, pos_homogeneous)
+        #     position = robot_pos_homogeneous[:3]
+        #
+        #     # Convert rotation (special orthogonal transformation)
+        #     rotation_in_robot_frame = np.dot(self.camera_to_robot_transform[:3, :3], rotation_matrix)
+        #     rotation_matrix = rotation_in_robot_frame

        return position, rotation_matrix, gripper_width
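For reference, the new gripper-width handling in _get_robot_params can be read as a small standalone filter. This is only a sketch: GripperWidthFilter is a hypothetical name, while the rolling deque(maxlen=100), the 20th-percentile threshold, and the MAXIMUM_HAND_WIDTH_MM normalization all come from the hunk above.

from collections import deque

import numpy as np

MAXIMUM_HAND_WIDTH_MM = 100.0  # max expected thumb-to-index distance in mm


class GripperWidthFilter:
    """Maps a raw thumb-to-index distance (mm) to a gripper command in [0, 1]."""

    def __init__(self) -> None:
        # Same rolling window as self.gripper_width_buffer in __init__
        self.buffer = deque(maxlen=100)

    def __call__(self, width_mm: float) -> float:
        self.buffer.append(width_mm)
        if len(self.buffer) > 5:  # need enough samples for a stable percentile
            min_width = np.percentile(self.buffer, 20)
            if width_mm < min_width:
                # Below the 20th percentile: treat the gripper as fully closed
                width_mm = 0.0
            else:
                # Otherwise scale to the robot's [0, 1] range
                width_mm = min(1.0, width_mm / MAXIMUM_HAND_WIDTH_MM)
        return width_mm

With this mapping, small pinch distances caused by tracking slippage collapse to a fully closed gripper instead of hovering just above the threshold, which is the slippage mitigation the removed comments referred to.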
@@ -483,16 +528,16 @@ class HandProcessor:
            # Process if hand is detected
            if results.multi_hand_landmarks:
                for hand_landmarks in results.multi_hand_landmarks:
-                    hand_mask = self._create_mask(frame, hand_landmarks)
+                    segmented_mask = self._create_mask(frame, hand_landmarks)

                    mask_overlay = frame.copy()
-                    mask_overlay[hand_mask > 0] = [0, 0, 255]  # Red color for mask
+                    mask_overlay[segmented_mask > 0] = [0, 0, 255]  # Red color for mask
                    segmented_frame = cv2.addWeighted(frame, 0.7, mask_overlay, 0.3, 0)

                    depth_map, depth_vis = self._estimate_depth(frame)
                    depth_colored = cv2.applyColorMap(depth_vis, cv2.COLORMAP_JET)
                    depth_frame = depth_colored.copy()

-                    cloud = self._create_cloud(depth_map, hand_mask)
+                    cloud = self._create_cloud(depth_map, segmented_mask)

                    hand_vertices, hand_faces = self._create_mesh(
                        hand_landmarks, (width, height)