Improve ICP registration and robot parameter extraction

Ethan Clark 2025-03-21 17:05:51 -07:00
parent aae4e91f54
commit b38527b5db
6 changed files with 690 additions and 417 deletions

View File

@ -1,92 +1,94 @@
import os
import time
import cv2
import depth_pro
import mediapipe as mp
import numpy as np
import open3d as o3d
import torch
from PIL import Image
from robot_manager import RobotManager
from sam2.sam2_image_predictor import SAM2ImagePredictor
from tqdm import tqdm
from utils.visualizations import *


class HandProcessor:
    def __init__(self) -> None:
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        self.mp_hands = mp.solutions.hands
        self.mp_drawing = mp.solutions.drawing_utils
        self.mp_drawing_styles = mp.solutions.drawing_styles
        self.hands = self.mp_hands.Hands(
            model_complexity=1,
            max_num_hands=1,
            static_image_mode=False,
        )

        # NOTE: Look into better depth estimation models
        # Initialize MiDaS for depth estimation
        print("Loading MiDaS model...")
        self.midas = torch.hub.load("intel-isl/MiDaS", "DPT_Hybrid")
        self.midas.to(self.device)
        self.midas.eval()

        # Load MiDaS transforms to resize and normalize input images
        self.midas_transforms = torch.hub.load("intel-isl/MiDaS", "transforms")
        self.transform = self.midas_transforms.dpt_transform

        # Load SAM2 model for hand segmentation
        print("Loading SAM2 model...")
        self.sam2_predictor = SAM2ImagePredictor.from_pretrained(
            "facebook/sam2-hiera-large"
        )

    def _create_mask(self, frame: np.ndarray, landmarks: list) -> np.ndarray:
        height, width = frame.shape[:2]
        frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

        # Set image in SAM2 predictor
        with torch.inference_mode(), torch.autocast("cuda", dtype=torch.bfloat16):
            self.sam2_predictor.set_image(frame_rgb)  # Set image for prediction

            # Convert landmarks to point prompts
            points = []
            for landmark in landmarks.landmark:
                x, y = int(landmark.x * width), int(landmark.y * height)
                points.append([x, y])

            input_points = np.array(points)

            # Predict mask with point prompts
            masks, _, _ = self.sam2_predictor.predict(
                point_coords=input_points,  # Pass the points as prompts
                point_labels=np.ones(len(input_points)),  # All points from hand are foreground
                multimask_output=False,  # Just get one mask
            )

            mask = masks[0].astype(np.uint8) * 255

        return mask

    """
    Transform input image to match MiDaS model requirements
    Estimate depth map using MiDaS model
    """

    # TODO: Look into why screen goes black sometimes
    # TODO: Swap MiDaS for ml-depth-pro model
    def _estimate_depth(self, image: np.ndarray) -> tuple:
        img = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

        # Transform img for MiDaS model
        input_batch = self.transform(img).to(self.device)

        with torch.inference_mode():
            prediction = self.midas(input_batch)

            # Interpolate to original size
            prediction = torch.nn.functional.interpolate(
                prediction.unsqueeze(1),
@ -94,127 +96,146 @@ class HandProcessor:
mode="bicubic", mode="bicubic",
align_corners=False, align_corners=False,
).squeeze() ).squeeze()
# Convert to numpy and normalize to 0-255 for visualization # Convert to numpy and normalize to 0-255 for visualization
depth_map = prediction.cpu().numpy() depth_map = prediction.cpu().numpy()
depth_min = depth_map.min() depth_min = depth_map.min()
depth_max = depth_map.max() depth_max = depth_map.max()
depth_map_normalized = 255 * (depth_map - depth_min) / (depth_max - depth_min) depth_map_normalized = 255 * (depth_map - depth_min) / (depth_max - depth_min)
return depth_map, depth_map_normalized.astype(np.uint8) return depth_map, depth_map_normalized.astype(np.uint8)
# TODO: Look into better depth scaling # TODO: Look into better depth scaling
def _create_cloud(self, depth_map: np.ndarray, hand_mask: np.ndarray) -> np.ndarray: def _create_cloud(self, depth_map: np.ndarray, hand_mask: np.ndarray) -> np.ndarray:
focal_x, focal_y, center_x, center_y = self.camera_intrinsics focal_x, focal_y, center_x, center_y = self.camera_intrinsics
v_coords, u_coords = np.where(hand_mask > 0) v_coords, u_coords = np.where(hand_mask > 0)
z_values = depth_map[v_coords, u_coords] z_values = depth_map[v_coords, u_coords]
# Filter out zero depth values # Filter out zero depth values
valid_indices = z_values > 0 valid_indices = z_values > 0
u_coords = u_coords[valid_indices] u_coords = u_coords[valid_indices]
v_coords = v_coords[valid_indices] v_coords = v_coords[valid_indices]
z_values = z_values[valid_indices] z_values = z_values[valid_indices]
# NOTE: Abritrary scaling factor for depth
z_metric = z_values * 0.5 z_metric = z_values * 0.5
# Back-project to 3D using camera intrinsics # Back-project to 3D using camera intrinsics
x_values = (u_coords - center_x) * z_metric / focal_x x_values = (u_coords - center_x) * z_metric / focal_x
y_values = (v_coords - center_y) * z_metric / focal_y y_values = (v_coords - center_y) * z_metric / focal_y
points = np.column_stack((x_values, y_values, z_metric)) points = np.column_stack((x_values, y_values, z_metric))
return points return points
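A quick sanity check of the pinhole back-projection used in _create_cloud (illustrative only, not part of the commit; the intrinsics and pixel values below are made up): re-projecting a back-projected point through the same intrinsics should recover the original pixel.

import numpy as np

focal_x, focal_y, center_x, center_y = 512.0, 384.0, 320.0, 240.0  # hypothetical intrinsics
u, v, z = 400, 300, 0.6  # pixel coordinates and metric depth

# Back-project (as in _create_cloud), then re-project.
x = (u - center_x) * z / focal_x
y = (v - center_y) * z / focal_y
u_back = x * focal_x / z + center_x
v_back = y * focal_y / z + center_y
assert np.isclose(u_back, u) and np.isclose(v_back, v)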
    # TODO: Look into better depth scaling
    def _create_mesh(self, landmarks: list, image_dims: tuple) -> np.ndarray:
        width, height = image_dims

        vertices = []
        for landmark in landmarks.landmark:
            vertices.append(
                [landmark.x * width, landmark.y * height, landmark.z * width]
            )

        # Define faces (triangles) connecting landmarks
        faces = [
            # Palm
            [0, 1, 5], [0, 5, 9], [0, 9, 13], [0, 13, 17],
            # Thumb
            [1, 2, 3], [2, 3, 4],
            # Index finger
            [5, 6, 7], [6, 7, 8],
            # Middle finger
            [9, 10, 11], [10, 11, 12],
            # Ring finger
            [13, 14, 15], [14, 15, 16],
            # Pinky
            [17, 18, 19], [18, 19, 20],
        ]

        dense_vertices = list(vertices)

        # Add interpolated vertices along finger segments
        connections = self.mp_hands.HAND_CONNECTIONS
        for connection in connections:
            start_idx, end_idx = connection
            start_point = np.array(vertices[start_idx])
            end_point = np.array(vertices[end_idx])

            # Add 2 interpolated points between each connected pair
            for t in [0.33, 0.66]:
                interp_point = start_point * (1 - t) + end_point * t
                dense_vertices.append(interp_point.tolist())

        return np.array(dense_vertices), np.array(faces)
    # TODO: Improve ICP registration for better alignment
    def _icp_registration(self, cloud: np.ndarray, vertices: np.ndarray) -> np.ndarray:
        source = o3d.geometry.PointCloud()
        source.points = o3d.utility.Vector3dVector(vertices)

        target = o3d.geometry.PointCloud()
        target.points = o3d.utility.Vector3dVector(cloud)

        # Calculate centroids
        source_centroid = np.mean(vertices, axis=0)
        target_centroid = np.mean(cloud, axis=0)

        # Calculate initial translation to align centroids
        initial_translation = target_centroid - source_centroid

        # Create initial transformation matrix
        trans_init = np.eye(4)
        trans_init[:3, 3] = initial_translation

        result = o3d.pipelines.registration.registration_icp(
            source,
            target,
            max_correspondence_distance=0.1,  # Increased distance
            init=trans_init,  # Initial transformation
            estimation_method=o3d.pipelines.registration.TransformationEstimationPointToPoint(),
            criteria=o3d.pipelines.registration.ICPConvergenceCriteria(
                max_iteration=100
            ),  # Increased iterations
        )

        transformation = result.transformation

        return transformation
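The key change here is seeding ICP with a centroid-aligning translation plus a larger correspondence distance and iteration budget. A minimal standalone sketch of the same idea on synthetic data (assuming Open3D is installed; the offset and tolerances are made up):

import numpy as np
import open3d as o3d

rng = np.random.default_rng(0)
src_pts = rng.random((200, 3))
tgt_pts = src_pts + np.array([0.3, -0.2, 0.5])  # target = source shifted by a known offset

source = o3d.geometry.PointCloud(o3d.utility.Vector3dVector(src_pts))
target = o3d.geometry.PointCloud(o3d.utility.Vector3dVector(tgt_pts))

# Centroid alignment as the initial guess keeps ICP's correspondence search local.
trans_init = np.eye(4)
trans_init[:3, 3] = tgt_pts.mean(axis=0) - src_pts.mean(axis=0)

result = o3d.pipelines.registration.registration_icp(
    source,
    target,
    max_correspondence_distance=0.1,
    init=trans_init,
    estimation_method=o3d.pipelines.registration.TransformationEstimationPointToPoint(),
    criteria=o3d.pipelines.registration.ICPConvergenceCriteria(max_iteration=100),
)
print(result.fitness, result.transformation)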
    # TODO: Look into better depth scaling
    def _refine_landmarks(self, landmarks: list, transform: np.ndarray, image_dims: tuple):
        width, height = image_dims

        refined_landmarks = []
        for landmark in landmarks.landmark:
            point = np.array(
                [landmark.x * width, landmark.y * height, landmark.z * width, 1.0]
            )
            # TODO: Figure out better scaling factor

            # Apply transformation to 3D point
            transformed = np.dot(transform, point)
            refined_landmarks.append(transformed[:3])

        return refined_landmarks
    def _apply_constraints(self, landmarks: list):
        constrained = np.array(landmarks)

        # Define finger joint indices
        # MediaPipe hand model: Wrist is 0, thumb is 1-4, index is 5-8, etc.
        thumb_indices = [1, 2, 3, 4]
        index_indices = [5, 6, 7, 8]

        # Constrain the last two joints of thumb and index finger
        # as mentioned in the paper
        for finger_indices in [thumb_indices, index_indices]:
@ -224,115 +245,165 @@ class HandProcessor:
            joint1 = constrained[finger_indices[-3]]
            joint2 = constrained[finger_indices[-2]]
            joint3 = constrained[finger_indices[-1]]

            # Direction of the first segment
            dir1 = joint2 - joint1
            dir1 = dir1 / np.linalg.norm(dir1)

            # Instead of full ball joint, constrain the last joint's direction
            # to follow similar direction as the previous segment
            ideal_dir = dir1.copy()
            actual_dir = joint3 - joint2
            actual_length = np.linalg.norm(actual_dir)

            # Force the direction to be within a cone of the previous segment
            # (limiting to single degree of freedom approximately)
            corrected_dir = ideal_dir * actual_length

            # Apply the correction
            constrained[finger_indices[-1]] = joint2 + corrected_dir

        return constrained
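The constraint above keeps the fingertip segment's length but snaps its direction onto the previous segment's direction. A toy numpy example of the same operation (the coordinates are made up):

import numpy as np

joint1 = np.array([0.0, 0.0, 0.0])   # proximal joint
joint2 = np.array([1.0, 0.0, 0.0])   # middle joint
joint3 = np.array([1.5, 0.8, 0.0])   # fingertip, bent sideways

dir1 = (joint2 - joint1) / np.linalg.norm(joint2 - joint1)
length = np.linalg.norm(joint3 - joint2)

# Constrained fingertip: same segment length, but along the previous direction.
joint3_constrained = joint2 + dir1 * length
print(joint3_constrained)  # -> approximately [1.9434, 0.0, 0.0]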
    def _get_robot_params(self, refined_landmarks):
        # Extract keypoints
        landmarks = np.array(refined_landmarks)

        # Define indices for specific parts of the hand
        thumb_indices = [1, 2, 3, 4]  # Thumb landmarks
        index_indices = [5, 6, 7, 8]  # Index finger landmarks
        thumb_tip_idx = 4
        index_tip_idx = 8

        # 1. Set target position as midpoint between thumb tip and index tip
        thumb_tip = landmarks[thumb_tip_idx]
        index_tip = landmarks[index_tip_idx]
        position = (thumb_tip + index_tip) / 2

        # 2. Calculate orientation
        # Get all thumb and index finger points for plane fitting
        thumb_points = landmarks[thumb_indices]
        index_points = landmarks[index_indices]
        finger_points = np.vstack([thumb_points, index_points])

        # Fit plane to finger points
        centroid = np.mean(finger_points, axis=0)
        centered_points = finger_points - centroid

        # Use SVD to find the normal to the best-fitting plane
        u, s, vh = np.linalg.svd(centered_points)
        # The normal is the last right singular vector
        normal = vh[2, :]

        # Ensure normal is a unit vector
        normal = normal / np.linalg.norm(normal)

        # Fit a principal axis through thumb points
        thumb_centroid = np.mean(thumb_points, axis=0)
        thumb_centered = thumb_points - thumb_centroid

        # Use SVD again to find direction of maximum variance (principal axis)
        u, s, vh = np.linalg.svd(thumb_centered)
        principal_axis = vh[0, :]  # First singular vector

        # Ensure principal axis is a unit vector
        principal_axis = principal_axis / np.linalg.norm(principal_axis)

        # Construct orthogonal vectors for orientation matrix
        z_axis = normal  # Normal to the plane
        x_axis = principal_axis  # Along the thumb

        # Ensure x_axis is orthogonal to z_axis
        x_axis = x_axis - np.dot(x_axis, z_axis) * z_axis
        x_axis = x_axis / np.linalg.norm(x_axis)

        # Compute y_axis as cross product to ensure orthogonality
        y_axis = np.cross(z_axis, x_axis)

        # Create rotation matrix
        rotation_matrix = np.column_stack([x_axis, y_axis, z_axis])

        # 3. Calculate gripper width
        gripper_width = np.linalg.norm(thumb_tip - index_tip)

        # Apply percentile threshold to mitigate slippage (as mentioned in the paper)
        # Store gripper widths in a buffer to compute percentiles
        if not hasattr(self, "gripper_width_buffer"):
            self.gripper_width_buffer = []

        self.gripper_width_buffer.append(gripper_width)

        # Keep only the last 100 values
        if len(self.gripper_width_buffer) > 100:
            self.gripper_width_buffer.pop(0)

        # Apply 20th percentile threshold
        if len(self.gripper_width_buffer) > 5:  # Need enough samples
            min_width = np.percentile(self.gripper_width_buffer, 20)
            if gripper_width < min_width:
                gripper_width = min_width

        return position, rotation_matrix, gripper_width
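The new orientation estimate comes from two SVDs: the plane normal of the combined thumb and index points, and the principal axis of the thumb. A standalone sketch with made-up landmark coordinates, checking that the resulting matrix is a proper rotation:

import numpy as np

thumb_points = np.array([[0.0, 0.0, 0.0], [1.0, 0.1, 0.0], [2.0, 0.2, 0.05], [3.0, 0.3, 0.0]])
index_points = np.array([[0.0, 2.0, 0.0], [1.0, 2.1, 0.0], [2.0, 2.2, 0.05], [3.0, 2.3, 0.0]])
finger_points = np.vstack([thumb_points, index_points])

# Plane normal: last right singular vector of the centered point set.
_, _, vh = np.linalg.svd(finger_points - finger_points.mean(axis=0))
z_axis = vh[2] / np.linalg.norm(vh[2])

# Thumb principal axis, orthogonalized against the plane normal.
_, _, vh_thumb = np.linalg.svd(thumb_points - thumb_points.mean(axis=0))
x_axis = vh_thumb[0] - np.dot(vh_thumb[0], z_axis) * z_axis
x_axis /= np.linalg.norm(x_axis)
y_axis = np.cross(z_axis, x_axis)

R = np.column_stack([x_axis, y_axis, z_axis])
assert np.allclose(R.T @ R, np.eye(3), atol=1e-8)  # orthonormal
assert np.isclose(np.linalg.det(R), 1.0)           # proper rotation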
    def record_video(self) -> str:
        output_dir = os.path.join(os.getcwd(), "recordings")
        os.makedirs(output_dir, exist_ok=True)

        timestamp = time.strftime("%Y%m%d-%H%M%S")
        video_filename = os.path.join(output_dir, f"recorded_video_{timestamp}.mp4")

        cap = cv2.VideoCapture(0)
        assert cap.isOpened(), "Failed to open camera."

        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
        fps = int(cap.get(cv2.CAP_PROP_FPS))

        # Update camera intrinsics based on video dimensions
        self.camera_intrinsics = (width * 0.8, height * 0.8, width / 2, height / 2)

        fourcc = cv2.VideoWriter_fourcc(*"mp4v")
        out = None
        recording = False
        did_record = False

        print("Camera is active. Press 'r' to start/stop recording, 'q' to quit.")

        while cap.isOpened():
            success, frame = cap.read()
            assert success, "Failed to read from camera."

            # Mirror the image for more intuitive viewing
            frame = cv2.flip(frame, 1)

            # Create a separate display frame for showing the recording indicator
            display_frame = frame.copy()

            # Display recording indicator ONLY on the display frame
            if recording:
                cv2.circle(display_frame, (30, 30), 15, (0, 0, 255), -1)
                cv2.putText(display_frame, "RECORDING", (50, 40),
                            cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2, cv2.LINE_AA)

            # Show the display frame (with indicator if recording)
            cv2.imshow("Video Recording", display_frame)

            # Write the original frame (without indicator) to video file if recording
            if recording and out is not None:
                out.write(frame)

            key = cv2.waitKey(1) & 0xFF
            if key == ord("r"):
                recording = not recording
                if recording:
                    print(f"Started recording to {video_filename}")
@ -342,29 +413,30 @@ class HandProcessor:
                    out.release()
                    print(f"Stopped recording. Video saved to {video_filename}")
                    did_record = True
                    break

        if out is not None:
            out.release()

        cap.release()
        cv2.destroyAllWindows()

        return video_filename if did_record else None
    def process_video(self, video_path: str) -> tuple:
        assert video_path, "Video path is required."

        cap = cv2.VideoCapture(video_path)
        assert cap.isOpened(), f"Failed to open video file {video_path}"

        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
        fps = int(cap.get(cv2.CAP_PROP_FPS))
        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

        # Update camera intrinsics based on video dimensions
        self.camera_intrinsics = (width * 0.8, height * 0.8, width / 2, height / 2)

        base_path = os.path.splitext(video_path)[0]
        segmented_output_path = f"{base_path}_masked.mp4"
        depth_output_path = f"{base_path}_depth.mp4"
@ -372,94 +444,112 @@ class HandProcessor:
        registration_output_path = f"{base_path}_registration.mp4"
        constraints_output_path = f"{base_path}_constraints.mp4"
        robot_output_path = f"{base_path}_robot.mp4"

        fourcc = cv2.VideoWriter_fourcc(*"mp4v")
        segmented_out = cv2.VideoWriter(segmented_output_path, fourcc, fps, (width, height))
        depth_out = cv2.VideoWriter(depth_output_path, fourcc, fps, (width, height))
        mesh_out = cv2.VideoWriter(mesh_output_path, fourcc, fps, (width, height))
        reg_out = cv2.VideoWriter(registration_output_path, fourcc, fps, (640, 480))  # Fixed size
        constraints_out = cv2.VideoWriter(constraints_output_path, fourcc, fps, (width, height))
        robot_out = cv2.VideoWriter(robot_output_path, fourcc, fps, (width, height))

        print(f"Processing video with {total_frames} frames...")

        for _ in tqdm(range(total_frames)):
            success, frame = cap.read()
            assert success, "Failed to read frame from video."

            # Convert image to RGB for MediaPipe
            image_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

            # Process with MediaPipe Hands
            results = self.hands.process(image_rgb)

            # Initialize output frames
            segmented_frame = frame.copy()
            depth_frame = np.zeros((height, width, 3), dtype=np.uint8)
            mesh_frame = frame.copy()
            reg_frame = np.ones((480, 640, 3), dtype=np.uint8) * 255  # Fixed size, white background
            constraints_frame = frame.copy()
            robot_frame = frame.copy()

            # Process if hand is detected
            if results.multi_hand_landmarks:
                for hand_landmarks in results.multi_hand_landmarks:
                    hand_mask = self._create_mask(frame, hand_landmarks)
                    mask_overlay = frame.copy()
                    mask_overlay[hand_mask > 0] = [0, 0, 255]  # Red color for mask
                    segmented_frame = cv2.addWeighted(frame, 0.7, mask_overlay, 0.3, 0)

                    depth_map, depth_vis = self._estimate_depth(frame)
                    depth_colored = cv2.applyColorMap(depth_vis, cv2.COLORMAP_JET)
                    depth_frame = depth_colored.copy()

                    cloud = self._create_cloud(depth_map, hand_mask)

                    hand_vertices, hand_faces = self._create_mesh(hand_landmarks, (width, height))
                    mesh_frame = visualize_mesh(frame, hand_vertices, hand_faces)

                    icp_transform = self._icp_registration(cloud, hand_vertices)
                    reg_frame = visualize_registration(cloud, hand_vertices, icp_transform)

                    refined_landmarks = self._refine_landmarks(hand_landmarks, icp_transform, (width, height))
                    original_refined = refined_landmarks.copy()

                    constrained_landmarks = self._apply_constraints(refined_landmarks)
                    constraints_frame = visualize_constraints(
                        frame, original_refined, constrained_landmarks, self.camera_intrinsics
                    )

                    position, orientation, gripper_width = self._get_robot_params(constrained_landmarks)
                    robot_frame = visualize_robot_params(
                        frame, position, orientation, gripper_width, self.camera_intrinsics
                    )

            segmented_out.write(segmented_frame)
            depth_out.write(depth_frame)
            mesh_out.write(mesh_frame)
            reg_out.write(reg_frame)
            constraints_out.write(constraints_frame)
            robot_out.write(robot_frame)

            display_scale = 0.5
            display_size = (int(width * display_scale), int(height * display_scale))
            reg_display_size = (int(640 * display_scale), int(480 * display_scale))

            cv2.imshow("Segmented", cv2.resize(segmented_frame, display_size))
            cv2.imshow("Depth", cv2.resize(depth_frame, display_size))
            cv2.imshow("Mesh", cv2.resize(mesh_frame, display_size))
            cv2.imshow("Registration", cv2.resize(reg_frame, reg_display_size))
            cv2.imshow("Constraints", cv2.resize(constraints_frame, display_size))
            cv2.imshow("Robot Parameters", cv2.resize(robot_frame, display_size))

            if cv2.waitKey(1) & 0xFF == 27:  # ESC to exit
                break

        cap.release()
        segmented_out.release()
        depth_out.release()
@ -468,7 +558,7 @@ class HandProcessor:
        constraints_out.release()
        robot_out.release()
        cv2.destroyAllWindows()

        print(f"Processing complete. Results saved to:")
        print(f"- Hand mask: {segmented_output_path}")
        print(f"- Depth visualization: {depth_output_path}")
@ -476,12 +566,12 @@ class HandProcessor:
print(f"- Registration visualization: {registration_output_path}") print(f"- Registration visualization: {registration_output_path}")
print(f"- Constraints visualization: {constraints_output_path}") print(f"- Constraints visualization: {constraints_output_path}")
print(f"- Robot parameters: {robot_output_path}") print(f"- Robot parameters: {robot_output_path}")
return { return {
"segmented": segmented_output_path, "segmented": segmented_output_path,
"depth": depth_output_path, "depth": depth_output_path,
"mesh": mesh_output_path, "mesh": mesh_output_path,
"registration": registration_output_path, "registration": registration_output_path,
"constraints": constraints_output_path, "constraints": constraints_output_path,
"robot": robot_output_path "robot": robot_output_path,
} }

View File

@ -5,16 +5,16 @@ from hand_processor import HandProcessor
def main():
    processor = HandProcessor()

    record_option = input("Record a new video? (y/n): ")

    if record_option.lower() == "y":
        print("Starting video recording...")
        video_path = processor.record_video()

        if not video_path:
            print("Video recording failed.")
            return

        print(f"Video recorded successfully to {video_path}")

        process_option = input("Process the video now? (y/n): ")

        if process_option.lower() == "n":
@ -25,9 +25,9 @@ def main():
        while not os.path.exists(video_path):
            print("Error: Video file not found.")
            video_path = input("Enter the path to the video file: ")

    processor.process_video(video_path)


if __name__ == "__main__":
    main()

View File

@ -1,72 +1,75 @@
import os
import cv2
import numpy as np
import pybullet as p
from utils.math import *
from utils.handle_urdf import handle_urdf
from scipy.spatial.transform import Rotation as R


class RobotManager:
    def __init__(self, urdf_path: str, camera_intrinsics: tuple) -> None:
        self.physics_client = p.connect(p.DIRECT)

        robot_urdf = handle_urdf(urdf_path)
        self.robot_id = self._load_robot(robot_urdf)
        self.joint_count = p.getNumJoints(self.robot_id)
        self.end_effector_index = self._find_end_effector()

        self.fx, self.fy, self.cx, self.cy = camera_intrinsics

        # Set up rendering parameters
        self.img_width = int(self.cx * 2)
        self.img_height = int(self.cy * 2)

    # Load the robot URDF into PyBullet
    def _load_robot(self, robot_urdf: str) -> int:
        try:
            robot_id = p.loadURDF(
                robot_urdf,
                basePosition=[0, 0, 0],
                useFixedBase=True,
                flags=p.URDF_USE_SELF_COLLISION | p.URDF_USE_INERTIA_FROM_FILE,
            )
        except p.error as e:
            print(f"PyBullet error when loading URDF: {e}")
            raise e

        robot_name = p.getBodyInfo(robot_id)[1].decode("utf-8")
        print(f"Successfully loaded {robot_name} robot with ID: {robot_id}")

        return robot_id

    # NOTE: Only applicable if the robot has one end effector
    def _find_end_effector(self) -> int:
        assert self.joint_count > 0, "Robot has no joints"

        # Keywords to look for in joint names to identify end effector
        keywords = ["gripper", "tool", "tcp", "end_effector", "hand"]

        for i in range(self.joint_count):
            info = p.getJointInfo(self.robot_id, i)
            joint_name = info[1].decode("utf-8").lower()

            # Check if any keyword is in the joint name
            if any(keyword in joint_name for keyword in keywords):
                return i

        # If no specific end effector found, use the last joint in the chain
        return self.joint_count - 1

    # TODO: Use inverse kinematics to set the robot pose
    def set_robot_pose(self, position: np.ndarray, orientation_vectors: np.ndarray, gripper_width: float) -> None:
        pass
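set_robot_pose is still a stub. One possible direction, sketched here only as an illustration (not part of the commit): convert the rotation matrix to a quaternion and feed PyBullet's inverse kinematics. The helper name is hypothetical and the joint indexing is simplified (fixed joints are ignored).

import numpy as np
import pybullet as p
from scipy.spatial.transform import Rotation as R

def set_robot_pose_sketch(robot_id, end_effector_index, position, rotation_matrix):
    # PyBullet expects orientation as a quaternion (x, y, z, w).
    quat = R.from_matrix(rotation_matrix).as_quat()
    joint_positions = p.calculateInverseKinematics(
        robot_id, end_effector_index, position, targetOrientation=quat
    )
    # Drive each joint toward the IK solution (assumes movable joints are 0..n-1).
    for joint_index, target in enumerate(joint_positions):
        p.setJointMotorControl2(
            robot_id, joint_index, p.POSITION_CONTROL, targetPosition=target
        )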
    # Render the robot in some scene using some camera parameters
    def render_robot(self, bg_image=None, camera_params=None):
        assert self.robot_id >= 0, "Robot not properly loaded"

        # Set up camera parameters
        if camera_params is None:
            # Default camera setup
@ -77,40 +80,37 @@ class RobotManager:
            cam_roll = 0
        else:
            cam_target, cam_distance, cam_yaw, cam_pitch, cam_roll = camera_params

        # Compute view matrix
        view_matrix = p.computeViewMatrixFromYawPitchRoll(
            cameraTargetPosition=cam_target,
            distance=cam_distance,
            yaw=cam_yaw,
            pitch=cam_pitch,
            roll=cam_roll,
            upAxisIndex=2,
        )

        # Compute projection matrix
        aspect = self.img_width / self.img_height
        proj_matrix = p.computeProjectionMatrixFOV(
            fov=60, aspect=aspect, nearVal=0.01, farVal=100.0
        )

        # Render the scene
        img_arr = p.getCameraImage(
            width=self.img_width,
            height=self.img_height,
            viewMatrix=view_matrix,
            projectionMatrix=proj_matrix,
            renderer=p.ER_BULLET_HARDWARE_OPENGL,
        )

        # Extract RGB and depth
        rgb = np.reshape(img_arr[2], (self.img_height, self.img_width, 4))
        rgb = rgb[:, :, :3]  # Remove alpha channel
        depth = np.reshape(img_arr[3], (self.img_height, self.img_width))

        # If background image is provided, composite
        if bg_image is not None:
            # Resize background if needed
@ -119,31 +119,33 @@ class RobotManager:
                bg_resized = cv2.resize(bg_image, (self.img_width, self.img_height))
            else:
                bg_resized = bg_image

            # Create mask from depth
            mask = (depth < 0.99).astype(np.float32)
            mask = np.stack([mask, mask, mask], axis=2)

            # Composite
            composite = bg_resized * (1 - mask) + rgb * mask

            return composite.astype(np.uint8)

        return rgb.astype(np.uint8)

    def __del__(self) -> None:
        if hasattr(self, "physics_client"):
            try:
                p.disconnect(self.physics_client)
            except:
                pass


if __name__ == "__main__":
    cwd = os.getcwd()
    urdf_path = os.path.join(
        cwd,
        "notebook/phantom/urdf/SO_5DOF_ARM100_05d.SLDASM/urdf/SO_5DOF_ARM100_05d.SLDASM.urdf",
    )
    camera_intrinsics = (320, 240, 320, 240)  # Random intrinsics for example

    robot_vis = RobotManager(urdf_path, camera_intrinsics)
    rendered_image = robot_vis.render_robot()
@ -155,4 +157,4 @@ if __name__ == "__main__":
    # Option 2: Save the image to a file
    output_path = "robot_render.png"
    cv2.imwrite(output_path, rendered_image)
    print(f"Render saved to {output_path}")

View File

@ -0,0 +1 @@
# TODO: Implement camera calibration functions to extract camera extrinsics and intrinsics
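The module is currently just this TODO. As a hedged sketch of what the calibration helper might eventually look like (not part of the commit; the function name, image glob, and board size are made up), OpenCV's checkerboard calibration can recover the intrinsics used elsewhere in the pipeline:

import glob

import cv2
import numpy as np


def calibrate_camera_sketch(image_glob="calibration/*.jpg", board_size=(9, 6)):
    # 3D checkerboard corner positions in the board frame (z = 0).
    objp = np.zeros((board_size[0] * board_size[1], 3), np.float32)
    objp[:, :2] = np.mgrid[0 : board_size[0], 0 : board_size[1]].T.reshape(-1, 2)

    obj_points, img_points, gray = [], [], None
    for path in glob.glob(image_glob):
        gray = cv2.cvtColor(cv2.imread(path), cv2.COLOR_BGR2GRAY)
        found, corners = cv2.findChessboardCorners(gray, board_size)
        if found:
            obj_points.append(objp)
            img_points.append(corners)

    # Returns the 3x3 camera matrix (fx, fy, cx, cy) plus per-view extrinsics.
    ret, camera_matrix, dist_coeffs, rvecs, tvecs = cv2.calibrateCamera(
        obj_points, img_points, gray.shape[::-1], None, None
    )
    return camera_matrix, dist_coeffs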

View File

@ -2,7 +2,6 @@ import os
import xml.dom.minidom as minidom
import xml.etree.ElementTree as ET


"""
Fixes a specific path that uses the ROS-style 'package://' prefix.
The 'package://' prefix is used in URDF files to refer to files in the same package.
@ -10,26 +9,28 @@ However, when we're not running in a ROS environment, the paths are not valid.
This function tries to find the absolute path of the mesh file.
If the mesh file is not found, the original path is used.
"""


def fix_path(path: str, urdf_dir: str) -> str:
    if path.startswith("package://"):
        parts = path[len("package://") :].split("/", 1)

        if len(parts) == 2:
            package_name, rel_path = parts

            # Try potential locations for the mesh
            potential_paths = [
                os.path.join(urdf_dir, rel_path),
                os.path.join(urdf_dir, "../meshes", rel_path),
                os.path.join(urdf_dir, f"../{package_name}", rel_path),
                os.path.join(urdf_dir, "../..", rel_path),
            ]

            for possible_path in potential_paths:
                if os.path.exists(possible_path):
                    return possible_path

            print(f"Failed to find mesh for package path: {path}")

    return path
@ -37,58 +38,67 @@ def fix_path(path: str, urdf_dir: str) -> str:
"""
Iterates through the URDF file and fixes the paths of all mesh files.
The URDF file is parsed and the mesh paths are modified in-place.
"""


def fix_mesh_paths(urdf_path: str, urdf_dir: str) -> str:
    root = ET.parse(urdf_path).getroot()

    try:
        for mesh in root.findall(".//mesh"):
            if "filename" in mesh.attrib:
                mesh.attrib["filename"] = fix_path(mesh.attrib["filename"], urdf_dir)
    except Exception as e:
        print(f"Error fixing mesh paths: {e}")
        raise e

    fixed_path = urdf_path.replace(".urdf", "_fixed.urdf")

    return root, fixed_path


"""
Formats the XML tree to be human-readable and writes it to a file.
"""


def format_xml(root: ET.Element, fixed_path: str) -> None:
    xml_str = ET.tostring(root, encoding="utf-8")
    dom = minidom.parseString(xml_str)

    with open(fixed_path, "w", encoding="utf-8") as f:
        # Write with nice indentation but remove extra whitespace
        pretty_xml = dom.toprettyxml(indent=" ")
        # Remove extra blank lines that minidom sometimes adds
        pretty_xml = "\n".join(
            [line for line in pretty_xml.split("\n") if line.strip()]
        )
        f.write(pretty_xml)


def handle_urdf(urdf_path: str) -> str:
    # Check if URDF is valid
    if not os.path.exists(urdf_path):
        print(f"Invalid URDF path: {urdf_path}")
        return None

    # FIXME: Add check to see if URDF needs to be processed
    try:
        urdf_dir = os.path.dirname(urdf_path)
        root, fixed_path = fix_mesh_paths(urdf_path, urdf_dir)
        format_xml(root, fixed_path)
        print(f"Successfully processed URDF: {fixed_path}")
        return fixed_path
    except Exception as e:
        print(f"Failed to process URDF: {e}")
        raise e


if __name__ == "__main__":
    # Example usage
    cwd = os.getcwd()
    urdf_path = os.path.join(
        cwd,
        "open_phantom/urdf/SO_5DOF_ARM100_05d.SLDASM/urdf/SO_5DOF_ARM100_05d.SLDASM.urdf",
    )
    handle_urdf(urdf_path)

View File

@ -6,75 +6,107 @@ def visualize_mesh(frame, hand_vertices, faces):
"""Visualize the hand mesh on the frame""" """Visualize the hand mesh on the frame"""
mesh_vis = frame.copy() mesh_vis = frame.copy()
height, width = frame.shape[:2] height, width = frame.shape[:2]
# Draw the base vertices of the mesh (original landmarks) # Draw the base vertices of the mesh (original landmarks)
for i in range(21): for i in range(21):
pt = tuple(map(int, hand_vertices[i][:2])) pt = tuple(map(int, hand_vertices[i][:2]))
if 0 <= pt[0] < width and 0 <= pt[1] < height: if 0 <= pt[0] < width and 0 <= pt[1] < height:
cv2.circle(mesh_vis, pt, 4, (255, 0, 0), -1) # Red circles for base vertices cv2.circle(
mesh_vis, pt, 4, (255, 0, 0), -1
) # Red circles for base vertices
# Draw the interpolated vertices # Draw the interpolated vertices
for i in range(21, len(hand_vertices)): for i in range(21, len(hand_vertices)):
pt = tuple(map(int, hand_vertices[i][:2])) pt = tuple(map(int, hand_vertices[i][:2]))
if 0 <= pt[0] < width and 0 <= pt[1] < height: if 0 <= pt[0] < width and 0 <= pt[1] < height:
cv2.circle(mesh_vis, pt, 2, (0, 255, 255), -1) # Yellow circles for interpolated vertices cv2.circle(
mesh_vis, pt, 2, (0, 255, 255), -1
) # Yellow circles for interpolated vertices
# Draw the faces of the mesh # Draw the faces of the mesh
for face in faces: for face in faces:
if len(face) == 3: # Triangle face if len(face) == 3: # Triangle face
pt1 = tuple(map(int, hand_vertices[face[0]][:2])) pt1 = tuple(map(int, hand_vertices[face[0]][:2]))
pt2 = tuple(map(int, hand_vertices[face[1]][:2])) pt2 = tuple(map(int, hand_vertices[face[1]][:2]))
pt3 = tuple(map(int, hand_vertices[face[2]][:2])) pt3 = tuple(map(int, hand_vertices[face[2]][:2]))
if (0 <= pt1[0] < width and 0 <= pt1[1] < height and if (
0 <= pt2[0] < width and 0 <= pt2[1] < height and 0 <= pt1[0] < width
0 <= pt3[0] < width and 0 <= pt3[1] < height): and 0 <= pt1[1] < height
cv2.line(mesh_vis, pt1, pt2, (0, 255, 0), 1) # Green lines for mesh edges and 0 <= pt2[0] < width
and 0 <= pt2[1] < height
and 0 <= pt3[0] < width
and 0 <= pt3[1] < height
):
cv2.line(
mesh_vis, pt1, pt2, (0, 255, 0), 1
) # Green lines for mesh edges
cv2.line(mesh_vis, pt2, pt3, (0, 255, 0), 1) cv2.line(mesh_vis, pt2, pt3, (0, 255, 0), 1)
cv2.line(mesh_vis, pt3, pt1, (0, 255, 0), 1) cv2.line(mesh_vis, pt3, pt1, (0, 255, 0), 1)
# Add explanation text # Add explanation text
cv2.putText(mesh_vis, "Red: Base vertices", (10, 30), cv2.putText(
cv2.FONT_HERSHEY_SIMPLEX, 0.6, (255, 0, 0), 2) mesh_vis,
cv2.putText(mesh_vis, "Yellow: Interpolated vertices", (10, 60), "Red: Base vertices",
cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0, 255, 255), 2) (10, 30),
cv2.putText(mesh_vis, "Green: Mesh edges", (10, 90), cv2.FONT_HERSHEY_SIMPLEX,
cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0, 255, 0), 2) 0.6,
(255, 0, 0),
2,
)
cv2.putText(
mesh_vis,
"Yellow: Interpolated vertices",
(10, 60),
cv2.FONT_HERSHEY_SIMPLEX,
0.6,
(0, 255, 255),
2,
)
cv2.putText(
mesh_vis,
"Green: Mesh edges",
(10, 90),
cv2.FONT_HERSHEY_SIMPLEX,
0.6,
(0, 255, 0),
2,
)
return mesh_vis return mesh_vis
def visualize_registration(cloud, vertices, icp_transform):
    """Visualize the ICP registration process"""
    height, width = 480, 640  # Fixed size visualization
    reg_vis = np.ones((height, width, 3), dtype=np.uint8) * 255  # White background

    # Define visualization boundaries
    margin = 50
    view_width = width - 2 * margin
    view_height = height - 2 * margin

    # Find 2D min/max values for scaling
    all_points_2d = np.vstack([cloud[:, :2], vertices[:, :2]])
    min_vals = np.min(all_points_2d, axis=0)
    max_vals = np.max(all_points_2d, axis=0)

    # Function to scale points to fit visualization
    def scale_point(point):
        scaled = (point - min_vals) / (max_vals - min_vals)
        x = int(scaled[0] * view_width) + margin
        y = int(scaled[1] * view_height) + margin
        return (x, y)

    # Draw original point cloud (red)
    for point in cloud[::10]:  # Downsample for visualization
        pt = scale_point(point[:2])
        cv2.circle(reg_vis, pt, 1, (0, 0, 255), -1)  # Red dots

    # Draw original mesh vertices (blue)
    for vertex in vertices:
        pt = scale_point(vertex[:2])
        cv2.circle(reg_vis, pt, 2, (255, 0, 0), -1)  # Blue dots

    # Apply transformation to mesh vertices
    transformed_vertices = []
    for vertex in vertices:
@ -83,181 +115,319 @@ def visualize_registration(cloud, vertices, icp_transform):
# Apply transformation # Apply transformation
v_transformed = np.dot(icp_transform, v_hom) v_transformed = np.dot(icp_transform, v_hom)
transformed_vertices.append(v_transformed[:3]) transformed_vertices.append(v_transformed[:3])
# Draw transformed mesh vertices (green) # Draw transformed mesh vertices (green)
for vertex in transformed_vertices: for vertex in transformed_vertices:
pt = scale_point(vertex[:2]) pt = scale_point(vertex[:2])
cv2.circle(reg_vis, pt, 2, (0, 255, 0), -1) # Green dots cv2.circle(reg_vis, pt, 2, (0, 255, 0), -1) # Green dots
# Add transformation matrix display # Add transformation matrix display
cv2.putText(reg_vis, "ICP Transformation Matrix:", (10, height - 120), cv2.putText(
cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 0), 1) reg_vis,
"ICP Transformation Matrix:",
(10, height - 120),
cv2.FONT_HERSHEY_SIMPLEX,
0.5,
(0, 0, 0),
1,
)
for i in range(4): for i in range(4):
matrix_text = f"[{icp_transform[i][0]:.2f}, {icp_transform[i][1]:.2f}, {icp_transform[i][2]:.2f}, {icp_transform[i][3]:.2f}]" matrix_text = f"[{icp_transform[i][0]:.2f}, {icp_transform[i][1]:.2f}, {icp_transform[i][2]:.2f}, {icp_transform[i][3]:.2f}]"
cv2.putText(reg_vis, matrix_text, (10, height - 90 + i * 20), cv2.putText(
cv2.FONT_HERSHEY_SIMPLEX, 0.4, (0, 0, 0), 1) reg_vis,
matrix_text,
(10, height - 90 + i * 20),
cv2.FONT_HERSHEY_SIMPLEX,
0.4,
(0, 0, 0),
1,
)
# Add legend # Add legend
cv2.putText(reg_vis, "Red: Point Cloud", (width - 200, 30), cv2.putText(
cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 255), 1) reg_vis,
cv2.putText(reg_vis, "Blue: Original Mesh", (width - 200, 60), "Red: Point Cloud",
cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 0, 0), 1) (width - 200, 30),
cv2.putText(reg_vis, "Green: Transformed Mesh", (width - 200, 90), cv2.FONT_HERSHEY_SIMPLEX,
cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 1) 0.5,
(0, 0, 255),
1,
)
cv2.putText(
reg_vis,
"Blue: Original Mesh",
(width - 200, 60),
cv2.FONT_HERSHEY_SIMPLEX,
0.5,
(255, 0, 0),
1,
)
cv2.putText(
reg_vis,
"Green: Transformed Mesh",
(width - 200, 90),
cv2.FONT_HERSHEY_SIMPLEX,
0.5,
(0, 255, 0),
1,
)
return reg_vis return reg_vis
def visualize_constraints(frame, refined_landmarks, constrained_landmarks, camera_intrinsics):
    """Visualize before and after applying anatomical constraints"""
    constraints_vis = frame.copy()
    height, width = frame.shape[:2]

    focal_x, focal_y, center_x, center_y = camera_intrinsics
# Define finger indices # Define finger indices
thumb_indices = [1, 2, 3, 4] thumb_indices = [1, 2, 3, 4]
index_indices = [5, 6, 7, 8] index_indices = [5, 6, 7, 8]
highlighted_indices = thumb_indices + index_indices highlighted_indices = thumb_indices + index_indices
# Draw original refined landmarks # Draw original refined landmarks
for i, landmark in enumerate(refined_landmarks): for i, landmark in enumerate(refined_landmarks):
x, y, z = landmark x, y, z = landmark
if z > 0: if z > 0:
u = int(x * focal_x / z + center_x) u = int(x * focal_x / z + center_x)
v = int(y * focal_y / z + center_y) v = int(y * focal_y / z + center_y)
if 0 <= u < width and 0 <= v < height: if 0 <= u < width and 0 <= v < height:
if i in highlighted_indices: if i in highlighted_indices:
# Draw larger circles for thumb and index fingers (the constrained ones) # Draw larger circles for thumb and index fingers (the constrained ones)
cv2.circle(constraints_vis, (u, v), 5, (0, 0, 255), -1) # Red circles cv2.circle(
constraints_vis, (u, v), 5, (0, 0, 255), -1
) # Red circles
else: else:
cv2.circle(constraints_vis, (u, v), 3, (0, 0, 255), -1) cv2.circle(constraints_vis, (u, v), 3, (0, 0, 255), -1)
# Draw connections for original landmarks # Draw connections for original landmarks
    for i in range(len(refined_landmarks) - 1):
        if i + 1 in thumb_indices and i in thumb_indices:
            # Draw thumb connections
            start = refined_landmarks[i]
            end = refined_landmarks[i + 1]
            if start[2] > 0 and end[2] > 0:
                start_u = int(start[0] * focal_x / start[2] + center_x)
                start_v = int(start[1] * focal_y / start[2] + center_y)
                end_u = int(end[0] * focal_x / end[2] + center_x)
                end_v = int(end[1] * focal_y / end[2] + center_y)
                if (
                    0 <= start_u < width
                    and 0 <= start_v < height
                    and 0 <= end_u < width
                    and 0 <= end_v < height
                ):
                    cv2.line(
                        constraints_vis,
                        (start_u, start_v),
                        (end_u, end_v),
                        (0, 0, 255),
                        2,
                    )

    # Draw constrained landmarks
    for i, landmark in enumerate(constrained_landmarks):
        x, y, z = landmark
        if z > 0:
            u = int(x * focal_x / z + center_x)
            v = int(y * focal_y / z + center_y)
            if 0 <= u < width and 0 <= v < height:
                if i in highlighted_indices:
                    # Draw larger circles for thumb and index fingers
                    cv2.circle(
                        constraints_vis, (u, v), 5, (0, 255, 0), -1
                    )  # Green circles
                else:
                    cv2.circle(constraints_vis, (u, v), 3, (0, 255, 0), -1)

    # Draw connections for constrained landmarks
    for i in range(len(constrained_landmarks) - 1):
        if i + 1 in thumb_indices and i in thumb_indices:
            # Draw thumb connections
            start = constrained_landmarks[i]
            end = constrained_landmarks[i + 1]
            if start[2] > 0 and end[2] > 0:
                start_u = int(start[0] * focal_x / start[2] + center_x)
                start_v = int(start[1] * focal_y / start[2] + center_y)
                end_u = int(end[0] * focal_x / end[2] + center_x)
                end_v = int(end[1] * focal_y / end[2] + center_y)
                if (
                    0 <= start_u < width
                    and 0 <= start_v < height
                    and 0 <= end_u < width
                    and 0 <= end_v < height
                ):
                    cv2.line(
                        constraints_vis,
                        (start_u, start_v),
                        (end_u, end_v),
                        (0, 255, 0),
                        2,
                    )

    # Add legend
    cv2.putText(
        constraints_vis,
        "Red: Before constraints",
        (10, 30),
        cv2.FONT_HERSHEY_SIMPLEX,
        0.7,
        (0, 0, 255),
        2,
    )
    cv2.putText(
        constraints_vis,
        "Green: After constraints",
        (10, 60),
        cv2.FONT_HERSHEY_SIMPLEX,
        0.7,
        (0, 255, 0),
        2,
    )

    return constraints_vis


def visualize_robot_params(
    frame, position, orientation_matrix, gripper_width, camera_intrinsics
):
    """Visualize robot parameters (position, orientation matrix, gripper width)"""
    robot_vis = frame.copy()
    height, width = frame.shape[:2]
    focal_x, focal_y, center_x, center_y = camera_intrinsics

    # Extract axes from orientation matrix
    x_axis = orientation_matrix[:, 0]  # First column (principal axis)
    y_axis = orientation_matrix[:, 1]  # Second column
    z_axis = orientation_matrix[:, 2]  # Third column (normal)
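    # NOTE: This assumes orientation_matrix is a 3x3 rotation matrix whose columns
    # are the gripper frame's unit axes expressed in the same camera frame as
    # `position`; each axis is visualized by projecting a short offset from the
    # end-effector position along that column.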

    # Project position to image coordinates
    x, y, z = position
    if z > 0:
        u = int(x * focal_x / z + center_x)
        v = int(y * focal_y / z + center_y)
        if 0 <= u < width and 0 <= v < height:
            # Draw end effector position
            cv2.circle(robot_vis, (u, v), 10, (255, 0, 0), -1)  # Blue circle
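            # Each axis endpoint below is position + axis * scale, projected with the
            # same pinhole model; `scale` is in the same units as `position`, so it
            # only controls how long the drawn axes appear on screen.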

            # Draw X axis (principal axis)
            dx, dy, dz = x_axis
            scale = 50  # Scale for better visualization
            end_u = int((x + dx * scale) * focal_x / (z + dz * scale) + center_x)
            end_v = int((y + dy * scale) * focal_y / (z + dz * scale) + center_y)
            if 0 <= end_u < width and 0 <= end_v < height:
                cv2.line(
                    robot_vis, (u, v), (end_u, end_v), (0, 0, 255), 2
                )  # Red line (X axis)
                cv2.putText(
                    robot_vis,
                    "X",
                    (end_u, end_v),
                    cv2.FONT_HERSHEY_SIMPLEX,
                    0.5,
                    (0, 0, 255),
                    2,
                )

            # Draw Z axis (normal)
            nx, ny, nz = z_axis
            end_u = int((x + nx * scale) * focal_x / (z + nz * scale) + center_x)
            end_v = int((y + ny * scale) * focal_y / (z + nz * scale) + center_y)
            if 0 <= end_u < width and 0 <= end_v < height:
                cv2.line(
                    robot_vis, (u, v), (end_u, end_v), (0, 255, 0), 2
                )  # Green line (Z axis)
                cv2.putText(
                    robot_vis,
                    "Z",
                    (end_u, end_v),
                    cv2.FONT_HERSHEY_SIMPLEX,
                    0.5,
                    (0, 255, 0),
                    2,
                )

            # Draw Y axis
            yx, yy, yz = y_axis
            end_u = int((x + yx * scale) * focal_x / (z + yz * scale) + center_x)
            end_v = int((y + yy * scale) * focal_y / (z + yz * scale) + center_y)
            if 0 <= end_u < width and 0 <= end_v < height:
                cv2.line(
                    robot_vis, (u, v), (end_u, end_v), (255, 0, 0), 2
                )  # Blue line (Y axis)
                cv2.putText(
                    robot_vis,
                    "Y",
                    (end_u, end_v),
                    cv2.FONT_HERSHEY_SIMPLEX,
                    0.5,
                    (255, 0, 0),
                    2,
                )

            # Visualize gripper width
            gripper_radius = int(gripper_width * 100)  # Scale for better visualization
            cv2.circle(
                robot_vis, (u, v), gripper_radius, (0, 255, 255), 2
            )  # Yellow circle
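            # NOTE: The factor of 100 is a display heuristic; it assumes gripper_width
            # is metric (e.g. meters), so the drawn radius is not a true image-space
            # projection of the gripper opening.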

            # Add parameter values as text
            y_offset = height - 160
            cv2.putText(
                robot_vis,
                f"Position: ({x:.2f}, {y:.2f}, {z:.2f})",
                (10, y_offset),
                cv2.FONT_HERSHEY_SIMPLEX,
                0.5,
                (255, 255, 255),
                1,
            )
            y_offset += 30
            cv2.putText(
                robot_vis,
                f"X axis: ({x_axis[0]:.2f}, {x_axis[1]:.2f}, {x_axis[2]:.2f})",
                (10, y_offset),
                cv2.FONT_HERSHEY_SIMPLEX,
                0.5,
                (0, 0, 255),
                1,
            )
            y_offset += 30
            cv2.putText(
                robot_vis,
                f"Z axis: ({z_axis[0]:.2f}, {z_axis[1]:.2f}, {z_axis[2]:.2f})",
                (10, y_offset),
                cv2.FONT_HERSHEY_SIMPLEX,
                0.5,
                (0, 255, 0),
                1,
            )
            y_offset += 30
            cv2.putText(
                robot_vis,
                f"Gripper Width: {gripper_width:.2f}",
                (10, y_offset),
                cv2.FONT_HERSHEY_SIMPLEX,
                0.5,
                (0, 255, 255),
                1,
            )

    return robot_vis