improve icp and robot params

This commit is contained in:
Ethan Clark 2025-03-21 17:05:51 -07:00
parent aae4e91f54
commit b38527b5db
6 changed files with 690 additions and 417 deletions

View File

@@ -1,92 +1,94 @@
import os
import cv2
import time
import torch
import depth_pro
import mediapipe as mp
import numpy as np
import open3d as o3d
from PIL import Image
from robot_manager import RobotManager
from sam2.sam2_image_predictor import SAM2ImagePredictor
from tqdm import tqdm
from utils.visualizations import *
# from diffusers import StableDiffusionInpaintPipeline
class HandProcessor:
def __init__(self) -> None:
self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
self.mp_hands = mp.solutions.hands
self.mp_drawing = mp.solutions.drawing_utils
self.mp_drawing_styles = mp.solutions.drawing_styles
self.hands = self.mp_hands.Hands(
model_complexity=1,
max_num_hands=1,
static_image_mode=False,
)
# NOTE: Look into better depth estimation models
# Initialize MiDaS for depth estimation
print("Loading MiDaS model...")
self.midas = torch.hub.load("intel-isl/MiDaS", "DPT_Hybrid")
self.midas.to(self.device)
self.midas.eval()
# Load MiDaS transforms to resize and normalize input images
self.midas_transforms = torch.hub.load("intel-isl/MiDaS", "transforms")
self.transform = self.midas_transforms.dpt_transform
# Load SAM2 model for hand segmentation
print("Loading SAM2 model...")
self.sam2_predictor = SAM2ImagePredictor.from_pretrained(
"facebook/sam2-hiera-large"
)
def _create_mask(self, frame: np.ndarray, landmarks: list) -> np.ndarray:
height, width = frame.shape[:2]
frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
# Set image in SAM2 predictor
with torch.inference_mode(), torch.autocast("cuda", dtype=torch.bfloat16):
self.sam2_predictor.set_image(frame_rgb) # Set image for prediction
# Convert landmarks to point prompts
points = []
for landmark in landmarks.landmark:
x, y = int(landmark.x * width), int(landmark.y * height)
points.append([x, y])
input_points = np.array(points)
# Predict mask with point prompts
masks, _, _ = self.sam2_predictor.predict(
point_coords=input_points, # Pass the points as prompts
point_labels=np.ones(
len(input_points)
), # All points from hand are foreground
multimask_output=False, # Just get one mask
)
mask = masks[0].astype(np.uint8) * 255
return mask
"""
Estimate a depth map with MiDaS: transform the input image to the model's
expected format, run inference, and return the raw prediction along with a
normalized uint8 map for visualization.
"""
# TODO: Look into why screen goes black sometimes
# TODO: Swap MiDaS for ml-depth-pro model
def _estimate_depth(self, image: np.ndarray) -> tuple:
img = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
# Transform img for MiDaS model
input_batch = self.transform(img).to(self.device)
with torch.inference_mode():
prediction = self.midas(input_batch)
# Interpolate to original size
prediction = torch.nn.functional.interpolate(
prediction.unsqueeze(1),
@@ -94,127 +96,146 @@ class HandProcessor:
mode="bicubic",
align_corners=False,
).squeeze()
# Convert to numpy and normalize to 0-255 for visualization
depth_map = prediction.cpu().numpy()
depth_min = depth_map.min()
depth_max = depth_map.max()
depth_map_normalized = 255 * (depth_map - depth_min) / (depth_max - depth_min)
return depth_map, depth_map_normalized.astype(np.uint8)
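# Sketch (assumption, not part of this commit): MiDaS predicts relative inverse depth,
# so the fixed 0.5 factor in _create_cloud below is only a placeholder. If sparse
# reference depths were available (e.g. from a calibration target), an affine fit could
# recover scale and shift; aligning inverse depth directly would be more faithful still.
def _align_depth_scale(self, depth_map: np.ndarray, ref_uv: np.ndarray, ref_z: np.ndarray) -> np.ndarray:
    """Fit depth ~ s * prediction + t against reference depths ref_z at pixel coords ref_uv."""
    pred = depth_map[ref_uv[:, 1], ref_uv[:, 0]]  # Sample predictions at the (u, v) pixels
    A = np.stack([pred, np.ones_like(pred)], axis=1)  # Design matrix [prediction, 1]
    (s, t), *_ = np.linalg.lstsq(A, ref_z, rcond=None)  # Least-squares scale and shift
    return s * depth_map + t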
# TODO: Look into better depth scaling
def _create_cloud(self, depth_map: np.ndarray, hand_mask: np.ndarray) -> np.ndarray:
focal_x, focal_y, center_x, center_y = self.camera_intrinsics
v_coords, u_coords = np.where(hand_mask > 0)
z_values = depth_map[v_coords, u_coords]
# Filter out zero depth values
valid_indices = z_values > 0
u_coords = u_coords[valid_indices]
v_coords = v_coords[valid_indices]
z_values = z_values[valid_indices]
# NOTE: Arbitrary scaling factor for depth
z_metric = z_values * 0.5
# Back-project to 3D using camera intrinsics
x_values = (u_coords - center_x) * z_metric / focal_x
y_values = (v_coords - center_y) * z_metric / focal_y
points = np.column_stack((x_values, y_values, z_metric))
return points
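# Sketch (assumption, not part of this commit): the raw hand cloud can contain tens of
# thousands of points, which slows ICP down. Open3D's voxel downsampling is one way to
# thin it before registration; the voxel_size default here is an arbitrary example value.
def _downsample_cloud(self, points: np.ndarray, voxel_size: float = 0.01) -> np.ndarray:
    """Voxel-downsample an Nx3 point array and return it as a numpy array."""
    pcd = o3d.geometry.PointCloud()
    pcd.points = o3d.utility.Vector3dVector(points)
    pcd = pcd.voxel_down_sample(voxel_size=voxel_size)  # Keep one representative point per voxel
    return np.asarray(pcd.points)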
# TODO: Look into better depth scaling
def _create_mesh(self, landmarks: list, image_dims: tuple) -> np.ndarray:
width, height = image_dims
vertices = []
for landmark in landmarks.landmark:
vertices.append(
[landmark.x * width, landmark.y * height, landmark.z * width]
)
# Define faces (triangles) connecting landmarks
faces = [
# Palm
[0, 1, 5],
[0, 5, 9],
[0, 9, 13],
[0, 13, 17],
# Thumb
[1, 2, 3],
[2, 3, 4],
# Index finger
[5, 6, 7],
[6, 7, 8],
# Middle finger
[9, 10, 11],
[10, 11, 12],
# Ring finger
[13, 14, 15],
[14, 15, 16],
# Pinky
[17, 18, 19],
[18, 19, 20],
]
dense_vertices = list(vertices)
# Add interpolated vertices along finger segments
connections = self.mp_hands.HAND_CONNECTIONS
for connection in connections:
start_idx, end_idx = connection
start_point = np.array(vertices[start_idx])
end_point = np.array(vertices[end_idx])
# Add 2 interpolated points between each connected pair
for t in [0.33, 0.66]:
interp_point = start_point * (1 - t) + end_point * t
dense_vertices.append(interp_point.tolist())
return np.array(dense_vertices), np.array(faces)
# TODO: Improve ICP registration for better alignment
def _icp_registration(self, cloud: np.ndarray, vertices: np.ndarray) -> np.ndarray:
source = o3d.geometry.PointCloud()
source.points = o3d.utility.Vector3dVector(vertices)
target = o3d.geometry.PointCloud()
target.points = o3d.utility.Vector3dVector(cloud)
# Calculate centroids
source_centroid = np.mean(vertices, axis=0)
target_centroid = np.mean(cloud, axis=0)
# Calculate initial translation to align centroids
initial_translation = target_centroid - source_centroid
# Create initial transformation matrix
trans_init = np.eye(4)
trans_init[:3, 3] = initial_translation
result = o3d.pipelines.registration.registration_icp(
source,
target,
max_correspondence_distance=0.1,  # Increased distance
init=trans_init, # Initial transformation
estimation_method=o3d.pipelines.registration.TransformationEstimationPointToPoint(),
criteria=o3d.pipelines.registration.ICPConvergenceCriteria(
max_iteration=100
), # Increased iterations
)
transformation = result.transformation
return transformation
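# Sketch (assumption, not part of this commit): Open3D can score a given transformation,
# which is a quick way to check whether the larger correspondence distance and iteration
# count above actually improve alignment. The 0.1 threshold mirrors the value used above.
def _evaluate_icp(self, cloud: np.ndarray, vertices: np.ndarray, transformation: np.ndarray) -> None:
    source = o3d.geometry.PointCloud()
    source.points = o3d.utility.Vector3dVector(vertices)
    target = o3d.geometry.PointCloud()
    target.points = o3d.utility.Vector3dVector(cloud)
    result = o3d.pipelines.registration.evaluate_registration(
        source, target, 0.1, transformation
    )
    print(f"ICP fitness: {result.fitness:.3f}, inlier RMSE: {result.inlier_rmse:.4f}")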
# TODO: Look into better depth scaling
def _refine_landmarks(self, landmarks: list, transform: np.ndarray, image_dims: tuple):
width, height = image_dims
refined_landmarks = []
for landmark in landmarks.landmark:
point = np.array(
[landmark.x * width, landmark.y * height, landmark.z * width, 1.0]
)
# Apply transformation to 3D point
transformed = np.dot(transform, point)
refined_landmarks.append(transformed[:3])
return refined_landmarks
def _apply_constraints(self, landmarks: list):
constrained = np.array(landmarks)
# Define finger joint indices
# MediaPipe hand model: Wrist is 0, thumb is 1-4, index is 5-8, etc.
thumb_indices = [1, 2, 3, 4]
index_indices = [5, 6, 7, 8]
# Constrain the last two joints of thumb and index finger
# as mentioned in the paper
for finger_indices in [thumb_indices, index_indices]:
@@ -224,115 +245,165 @@ class HandProcessor:
joint1 = constrained[finger_indices[-3]]
joint2 = constrained[finger_indices[-2]]
joint3 = constrained[finger_indices[-1]]
# Direction of the first segment
dir1 = joint2 - joint1
dir1 = dir1 / np.linalg.norm(dir1)
# Instead of a full ball joint, constrain the last joint's direction
# to follow the direction of the previous segment
ideal_dir = dir1.copy()
actual_dir = joint3 - joint2
actual_length = np.linalg.norm(actual_dir)
# Force the direction to be within a cone of the previous segment
# (limiting to single degree of freedom approximately)
corrected_dir = ideal_dir * actual_length
# Apply the correction
constrained[finger_indices[-1]] = joint2 + corrected_dir
return constrained
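# Sketch (assumption, not part of this commit): the comment above describes a cone
# constraint, but the code snaps the distal segment exactly onto the previous segment's
# direction. An actual cone clamp would only rotate it back when it exceeds a maximum angle:
def clamp_to_cone(prev_dir: np.ndarray, segment: np.ndarray, max_angle: float) -> np.ndarray:
    """Limit the angle between `segment` and unit vector `prev_dir` to `max_angle` radians."""
    length = np.linalg.norm(segment)
    if length < 1e-9:
        return segment
    seg_dir = segment / length
    angle = np.arccos(np.clip(np.dot(seg_dir, prev_dir), -1.0, 1.0))
    if angle <= max_angle:
        return segment  # Already inside the cone
    axis = np.cross(prev_dir, seg_dir)
    if np.linalg.norm(axis) < 1e-9:
        return prev_dir * length  # Anti-parallel edge case: snap to the previous direction
    axis = axis / np.linalg.norm(axis)
    # Rodrigues rotation of prev_dir by max_angle about axis (axis is orthogonal to prev_dir)
    clamped_dir = prev_dir * np.cos(max_angle) + np.cross(axis, prev_dir) * np.sin(max_angle)
    return clamped_dir * length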
def _get_robot_params(self, refined_landmarks):
# Extract keypoints
landmarks = np.array(refined_landmarks)
# Define indices for specific parts of the hand
thumb_indices = [1, 2, 3, 4] # Thumb landmarks
index_indices = [5, 6, 7, 8] # Index finger landmarks
thumb_tip_idx = 4
index_tip_idx = 8
# 1. Set target position as midpoint between thumb tip and index tip
thumb_tip = landmarks[thumb_tip_idx]
index_tip = landmarks[index_tip_idx]
position = (thumb_tip + index_tip) / 2
# 2. Calculate orientation
# Get all thumb and index finger points for plane fitting
thumb_points = landmarks[thumb_indices]
index_points = landmarks[index_indices]
finger_points = np.vstack([thumb_points, index_points])
# Fit plane to finger points
centroid = np.mean(finger_points, axis=0)
centered_points = finger_points - centroid
# Use SVD to find the normal to the best-fitting plane
u, s, vh = np.linalg.svd(centered_points)
# The normal is the last right singular vector
normal = vh[2, :]
# Ensure normal is a unit vector
normal = normal / np.linalg.norm(normal)
# Fit a principal axis through thumb points
thumb_centroid = np.mean(thumb_points, axis=0)
thumb_centered = thumb_points - thumb_centroid
# Use SVD again to find direction of maximum variance (principal axis)
u, s, vh = np.linalg.svd(thumb_centered)
principal_axis = vh[0, :] # First singular vector
# Ensure principal axis is a unit vector
principal_axis = principal_axis / np.linalg.norm(principal_axis)
# Construct orthogonal vectors for orientation matrix
z_axis = normal # Normal to the plane
x_axis = principal_axis # Along the thumb
# Ensure x_axis is orthogonal to z_axis
x_axis = x_axis - np.dot(x_axis, z_axis) * z_axis
x_axis = x_axis / np.linalg.norm(x_axis)
# Compute y_axis as cross product to ensure orthogonality
y_axis = np.cross(z_axis, x_axis)
# Create rotation matrix
rotation_matrix = np.column_stack([x_axis, y_axis, z_axis])
# 3. Calculate gripper width
gripper_width = np.linalg.norm(thumb_tip - index_tip)
# Apply percentile threshold to mitigate slippage (as mentioned in the paper)
# Store gripper widths in a buffer to compute percentiles
if not hasattr(self, "gripper_width_buffer"):
self.gripper_width_buffer = []
self.gripper_width_buffer.append(gripper_width)
# Keep only the last 100 values
if len(self.gripper_width_buffer) > 100:
self.gripper_width_buffer.pop(0)
# Apply 20th percentile threshold
if len(self.gripper_width_buffer) > 5: # Need enough samples
min_width = np.percentile(self.gripper_width_buffer, 20)
if gripper_width < min_width:
gripper_width = min_width
return position, rotation_matrix, gripper_width
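# Sketch (assumption, not part of this commit): downstream consumers such as PyBullet IK
# expect a quaternion rather than a 3x3 matrix, so the rotation_matrix returned above
# typically needs converting. scipy is already a dependency of robot_manager.py.
def rotation_matrix_to_quaternion(rotation_matrix: np.ndarray) -> np.ndarray:
    from scipy.spatial.transform import Rotation as R

    return R.from_matrix(rotation_matrix).as_quat()  # (x, y, z, w), PyBullet's convention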
def record_video(self) -> str:
output_dir = os.path.join(os.getcwd(), "recordings")
os.makedirs(output_dir, exist_ok=True)
timestamp = time.strftime("%Y%m%d-%H%M%S")
video_filename = os.path.join(output_dir, f"recorded_video_{timestamp}.mp4")
cap = cv2.VideoCapture(0)
assert cap.isOpened(), "Failed to open camera."
width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
fps = int(cap.get(cv2.CAP_PROP_FPS))
# Update camera intrinsics based on video dimensions
self.camera_intrinsics = (width * 0.8, height * 0.8, width / 2, height / 2)
fourcc = cv2.VideoWriter_fourcc(*"mp4v")
out = None
recording = False
did_record = False
print("Camera is active. Press 'r' to start/stop recording, 'q' to quit.")
while cap.isOpened():
success, frame = cap.read()
assert success, "Failed to read from camera."
# Mirror the image for more intuitive viewing
frame = cv2.flip(frame, 1)
# Create a separate display frame for showing the recording indicator
display_frame = frame.copy()
# Display recording indicator ONLY on the display frame
if recording:
cv2.circle(display_frame, (30, 30), 15, (0, 0, 255), -1)
cv2.putText(
display_frame,
"RECORDING",
(50, 40),
cv2.FONT_HERSHEY_SIMPLEX,
1,
(0, 0, 255),
2,
cv2.LINE_AA,
)
# Show the display frame (with indicator if recording)
cv2.imshow("Video Recording", display_frame)
# Write the original frame (without indicator) to video file if recording
if recording and out is not None:
out.write(frame)
key = cv2.waitKey(1) & 0xFF
# NOTE: Haven't tested what happens if user records multiple videos in one session (probably overwrites?)
if key == ord("r"):
recording = not recording
if recording:
print(f"Started recording to {video_filename}")
@@ -342,29 +413,30 @@ class HandProcessor:
out.release()
print(f"Stopped recording. Video saved to {video_filename}")
did_record = True
break
if out is not None:
out.release()
cap.release()
cv2.destroyAllWindows()
return video_filename if did_record else None
def process_video(self, video_path: str) -> tuple:
assert video_path, "Video path is required."
cap = cv2.VideoCapture(video_path)
assert cap.isOpened(), f"Failed to open video file {video_path}"
width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
fps = int(cap.get(cv2.CAP_PROP_FPS))
total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
# Update camera intrinsics based on video dimensions
self.camera_intrinsics = (width * 0.8, height * 0.8, width / 2, height / 2)
base_path = os.path.splitext(video_path)[0]
segmented_output_path = f"{base_path}_masked.mp4"
depth_output_path = f"{base_path}_depth.mp4"
@@ -372,94 +444,112 @@ class HandProcessor:
registration_output_path = f"{base_path}_registration.mp4"
constraints_output_path = f"{base_path}_constraints.mp4"
robot_output_path = f"{base_path}_robot.mp4"
fourcc = cv2.VideoWriter_fourcc(*"mp4v")
segmented_out = cv2.VideoWriter(
segmented_output_path, fourcc, fps, (width, height)
)
depth_out = cv2.VideoWriter(depth_output_path, fourcc, fps, (width, height))
mesh_out = cv2.VideoWriter(mesh_output_path, fourcc, fps, (width, height))
reg_out = cv2.VideoWriter(
registration_output_path, fourcc, fps, (640, 480)
)  # Fixed size
constraints_out = cv2.VideoWriter(
constraints_output_path, fourcc, fps, (width, height)
)
robot_out = cv2.VideoWriter(robot_output_path, fourcc, fps, (width, height))
print(f"Processing video with {total_frames} frames...")
for _ in tqdm(range(total_frames)):
success, frame = cap.read()
assert success, "Failed to read frame from video."
# Convert image to RGB for MediaPipe
image_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
# Process with MediaPipe Hands
results = self.hands.process(image_rgb)
# Initialize output frames
segmented_frame = frame.copy()
depth_frame = np.zeros((height, width, 3), dtype=np.uint8)
mesh_frame = frame.copy()
reg_frame = (
np.ones((480, 640, 3), dtype=np.uint8) * 255
)  # Fixed size, white background
constraints_frame = frame.copy()
robot_frame = frame.copy()
# Process if hand is detected
if results.multi_hand_landmarks:
for hand_landmarks in results.multi_hand_landmarks:
# Segment hand
hand_mask = self._create_mask(frame, hand_landmarks)
mask_overlay = frame.copy()
mask_overlay[hand_mask > 0] = [0, 0, 255] # Red color for mask
segmented_frame = cv2.addWeighted(frame, 0.7, mask_overlay, 0.3, 0)
# Depth estimation
depth_map, depth_vis = self._estimate_depth(frame)
depth_colored = cv2.applyColorMap(depth_vis, cv2.COLORMAP_JET)
depth_frame = depth_colored.copy()
# Create hand mesh
hand_vertices, hand_faces = self._create_mesh(
hand_landmarks, (width, height)
)
mesh_frame = visualize_mesh(frame, hand_vertices, hand_faces)
# Create point cloud from depth
cloud = self._create_cloud(depth_map, hand_mask)
# Perform ICP registration
icp_transform = self._icp_registration(cloud, hand_vertices)
reg_frame = visualize_registration(
cloud, hand_vertices, icp_transform
)
# Refine landmarks using ICP transformation
refined_landmarks = self._refine_landmarks(
hand_landmarks, icp_transform, (width, height)
)
# Store pre-constraint landmarks for visualization
original_refined = refined_landmarks.copy()
# Apply anatomical constraints
constrained_landmarks = self._apply_constraints(refined_landmarks)
constraints_frame = visualize_constraints(
frame,
original_refined,
constrained_landmarks,
self.camera_intrinsics,
)
# Calculate robot parameters
position, orientation, gripper_width = self._get_robot_params(
constrained_landmarks
)
robot_frame = visualize_robot_params(
frame,
position,
orientation,
gripper_width,
self.camera_intrinsics,
)
segmented_out.write(segmented_frame)
depth_out.write(depth_frame)
mesh_out.write(mesh_frame)
reg_out.write(reg_frame)
constraints_out.write(constraints_frame)
robot_out.write(robot_frame)
display_scale = 0.5
display_size = (int(width * display_scale), int(height * display_scale))
reg_display_size = (int(640 * display_scale), int(480 * display_scale))
cv2.imshow("Segmented", cv2.resize(segmented_frame, display_size))
cv2.imshow("Depth", cv2.resize(depth_frame, display_size))
cv2.imshow("Mesh", cv2.resize(mesh_frame, display_size))
cv2.imshow("Registration", cv2.resize(reg_frame, reg_display_size))
cv2.imshow("Constraints", cv2.resize(constraints_frame, display_size))
cv2.imshow("Robot Parameters", cv2.resize(robot_frame, display_size))
if cv2.waitKey(1) & 0xFF == 27: # ESC to exit
break
cap.release()
segmented_out.release()
depth_out.release()
@@ -468,7 +558,7 @@ class HandProcessor:
constraints_out.release()
robot_out.release()
cv2.destroyAllWindows()
print(f"Processing complete. Results saved to:")
print(f"- Hand mask: {segmented_output_path}")
print(f"- Depth visualization: {depth_output_path}")
@@ -476,12 +566,12 @@ class HandProcessor:
print(f"- Registration visualization: {registration_output_path}")
print(f"- Constraints visualization: {constraints_output_path}")
print(f"- Robot parameters: {robot_output_path}")
return {
"segmented": segmented_output_path,
"depth": depth_output_path,
"mesh": mesh_output_path,
"registration": registration_output_path,
"constraints": constraints_output_path,
"robot": robot_output_path,
}

View File

@@ -5,16 +5,16 @@ from hand_processor import HandProcessor
def main():
processor = HandProcessor()
record_option = input("Record a new video? (y/n): ")
if record_option.lower() == "y":
print("Starting video recording...")
video_path = processor.record_video()
if not video_path:
print("Video recording failed.")
return
print(f"Video recorded successfully to {video_path}")
process_option = input("Process the video now? (y/n): ")
if process_option.lower() == "n":
@@ -25,9 +25,9 @@ def main():
while not os.path.exists(video_path):
print("Error: Video file not found.")
video_path = input("Enter the path to the video file: ")
processor.process_video(video_path)
if __name__ == "__main__":
main()

View File

@@ -1,72 +1,75 @@
import os
import cv2
import numpy as np
import pybullet as p
from utils.math import *
from utils.handle_urdf import handle_urdf
from scipy.spatial.transform import Rotation as R
class RobotManager:
def __init__(self, urdf_path: str, camera_intrinsics: tuple) -> None:
self.physics_client = p.connect(p.DIRECT)
robot_urdf = handle_urdf(urdf_path)
self.robot_id = self._load_robot(robot_urdf)
self.joint_count = p.getNumJoints(self.robot_id)
self.end_effector_index = self._find_end_effector()
self.fx, self.fy, self.cx, self.cy = camera_intrinsics
# Set up rendering parameters
self.img_width = int(self.cx * 2)
self.img_height = int(self.cy * 2)
# Load the robot URDF into PyBullet
def _load_robot(self, robot_urdf: str) -> int:
try:
robot_id = p.loadURDF(
robot_urdf,
basePosition=[0, 0, 0],
useFixedBase=True,
flags=p.URDF_USE_SELF_COLLISION | p.URDF_USE_INERTIA_FROM_FILE,
)
except p.error as e:
print(f"PyBullet error when loading URDF: {e}")
raise e
robot_name = p.getBodyInfo(robot_id)[1].decode("utf-8")
print(f"Successfully loaded {robot_name} robot with ID: {robot_id}")
return robot_id
# NOTE: Only applicable if the robot has one end effector
def _find_end_effector(self) -> int:
assert self.joint_count > 0, "Robot has no joints"
# Keywords to look for in joint names to identify end effector
keywords = ["gripper", "tool", "tcp", "end_effector", "hand"]
for i in range(self.joint_count):
info = p.getJointInfo(self.robot_id, i)
joint_name = info[1].decode("utf-8").lower()
# Check if any keyword is in the joint name
if any(keyword in joint_name for keyword in keywords):
return i
# If no specific end effector found, use the last joint in the chain
return self.joint_count - 1
# TODO: Use inverse kinematics to set the robot pose
def set_robot_pose(
self,
position: np.ndarray,
orientation_vectors: np.ndarray,
gripper_width: float,
) -> None:
pass
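# Sketch of one possible implementation (assumption, not the commit's code), using
# PyBullet's built-in IK. Written as a free function so it does not clash with the stub
# above; `manager` is a RobotManager and `orientation_matrix` is the 3x3 rotation matrix
# produced by HandProcessor._get_robot_params.
def set_robot_pose_ik(manager, position: np.ndarray, orientation_matrix: np.ndarray) -> None:
    quat = R.from_matrix(orientation_matrix).as_quat()  # (x, y, z, w), PyBullet convention
    joint_positions = p.calculateInverseKinematics(
        manager.robot_id,
        manager.end_effector_index,
        targetPosition=list(position),
        targetOrientation=list(quat),
    )
    # NOTE: assumes every joint is movable; fixed joints would shift these indices
    for joint_index, joint_value in enumerate(joint_positions):
        p.resetJointState(manager.robot_id, joint_index, joint_value)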
# Render the robot in some scene using some camera parameters
def render_robot(self, bg_image=None, camera_params=None):
assert self.robot_id >= 0, "Robot not properly loaded"
# Set up camera parameters
if camera_params is None:
# Default camera setup
@@ -77,40 +80,37 @@ class RobotManager:
cam_roll = 0
else:
cam_target, cam_distance, cam_yaw, cam_pitch, cam_roll = camera_params
# Compute view matrix
view_matrix = p.computeViewMatrixFromYawPitchRoll(
cameraTargetPosition=cam_target,
distance=cam_distance,
yaw=cam_yaw,
pitch=cam_pitch,
roll=cam_roll,
upAxisIndex=2,
)
# Compute projection matrix
aspect = self.img_width / self.img_height
proj_matrix = p.computeProjectionMatrixFOV(
fov=60, aspect=aspect, nearVal=0.01, farVal=100.0
)
# Render the scene
img_arr = p.getCameraImage(
width=self.img_width,
height=self.img_height,
viewMatrix=view_matrix,
projectionMatrix=proj_matrix,
renderer=p.ER_BULLET_HARDWARE_OPENGL,
)
# Extract RGB and depth
rgb = np.reshape(img_arr[2], (self.img_height, self.img_width, 4))
rgb = rgb[:, :, :3] # Remove alpha channel
depth = np.reshape(img_arr[3], (self.img_height, self.img_width))
# If background image is provided, composite
if bg_image is not None:
# Resize background if needed
@@ -119,31 +119,33 @@ class RobotManager:
bg_resized = cv2.resize(bg_image, (self.img_width, self.img_height))
else:
bg_resized = bg_image
# Create mask from depth
mask = (depth < 0.99).astype(np.float32)
mask = np.stack([mask, mask, mask], axis=2)
# Composite
composite = bg_resized * (1 - mask) + rgb * mask
return composite.astype(np.uint8)
return rgb.astype(np.uint8)
def __del__(self) -> None:
if hasattr(self, "physics_client"):
try:
p.disconnect(self.physics_client)
except:
pass
if __name__ == "__main__":
cwd = os.getcwd()
urdf_path = os.path.join(
cwd,
"notebook/phantom/urdf/SO_5DOF_ARM100_05d.SLDASM/urdf/SO_5DOF_ARM100_05d.SLDASM.urdf",
)
camera_intrinsics = (320, 240, 320, 240)  # Random intrinsics for example
robot_vis = RobotManager(urdf_path, camera_intrinsics)
rendered_image = robot_vis.render_robot()
@@ -155,4 +157,4 @@ if __name__ == "__main__":
# Option 2: Save the image to a file
output_path = "robot_render.png"
cv2.imwrite(output_path, rendered_image)
print(f"Render saved to {output_path}")

View File

@@ -0,0 +1 @@
# TODO: Implement camera calibration functions to extract camera extrinsics and intrinsics

View File

@@ -2,7 +2,6 @@ import os
import xml.dom.minidom as minidom
import xml.etree.ElementTree as ET
"""
Fixes a specific path that uses the ROS-style 'package://' prefix.
The 'package://' prefix is used in URDF files to refer to files in the same package.
@@ -10,26 +9,28 @@ However, when we're not running in a ROS environment, the paths are not valid.
This function tries to find the absolute path of the mesh file.
If the mesh file is not found, the original path is used.
"""
def fix_path(path: str, urdf_dir: str) -> str:
if path.startswith("package://"):
parts = path[len("package://") :].split("/", 1)
if len(parts) == 2:
package_name, rel_path = parts
# Try potential locations for the mesh
potential_paths = [
os.path.join(urdf_dir, rel_path),
os.path.join(urdf_dir, "../meshes", rel_path),
os.path.join(urdf_dir, f"../{package_name}", rel_path),
os.path.join(urdf_dir, "../..", rel_path),
]
for possible_path in potential_paths:
if os.path.exists(possible_path):
return possible_path
print(f"Failed to find mesh for package path: {path}")
return path
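# Usage sketch (hypothetical paths): given a URDF at /repo/urdf/arm.urdf whose meshes live
# in /repo/meshes/, a ROS-style reference resolves to the first existing candidate:
#   fix_path("package://arm_description/meshes/base_link.STL", "/repo/urdf")
#   -> "/repo/urdf/../meshes/base_link.STL" if that file exists,
#      otherwise the original "package://..." string is returned unchanged.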
@@ -37,58 +38,67 @@ def fix_path(path: str, urdf_dir: str) -> str:
Iterates through the URDF file and fixes the paths of all mesh files.
The URDF file is parsed and the mesh paths are modified in-place.
"""
def fix_mesh_paths(urdf_path: str, urdf_dir: str) -> tuple:
root = ET.parse(urdf_path).getroot()
try:
for mesh in root.findall(".//mesh"):
if "filename" in mesh.attrib:
mesh.attrib["filename"] = fix_path(mesh.attrib["filename"], urdf_dir)
except Exception as e:
print(f"Error fixing mesh paths: {e}")
raise e
fixed_path = urdf_path.replace(".urdf", "_fixed.urdf")
return root, fixed_path
"""
Formats the XML tree to be human-readable and writes it to a file.
"""
def format_xml(root: ET.Element, fixed_path: str) -> None:
xml_str = ET.tostring(root, encoding="utf-8")
dom = minidom.parseString(xml_str)
with open(fixed_path, "w", encoding="utf-8") as f:
# Write with nice indentation but remove extra whitespace
pretty_xml = dom.toprettyxml(indent=" ")
# Remove extra blank lines that minidom sometimes adds
pretty_xml = "\n".join(
[line for line in pretty_xml.split("\n") if line.strip()]
)
f.write(pretty_xml)
def handle_urdf(urdf_path: str) -> str:
# Check if URDF is valid
if not os.path.exists(urdf_path):
print(f"Invalid URDF path: {urdf_path}")
return None
# FIXME: Add check to see if URDF needs to be processed
try:
urdf_dir = os.path.dirname(urdf_path)
root, fixed_path = fix_mesh_paths(urdf_path, urdf_dir)
format_xml(root, fixed_path)
print(f"Successfully processed URDF: {fixed_path}")
return fixed_path
except Exception as e:
print(f"Failed to process URDF: {e}")
raise e
if __name__ == "__main__":
# Example usage
cwd = os.getcwd()
urdf_path = os.path.join(
cwd,
"open_phantom/urdf/SO_5DOF_ARM100_05d.SLDASM/urdf/SO_5DOF_ARM100_05d.SLDASM.urdf",
)
handle_urdf(urdf_path)

View File

@@ -6,75 +6,107 @@ def visualize_mesh(frame, hand_vertices, faces):
"""Visualize the hand mesh on the frame"""
mesh_vis = frame.copy()
height, width = frame.shape[:2]
# Draw the base vertices of the mesh (original landmarks)
for i in range(21):
pt = tuple(map(int, hand_vertices[i][:2]))
if 0 <= pt[0] < width and 0 <= pt[1] < height:
cv2.circle(
mesh_vis, pt, 4, (255, 0, 0), -1
)  # Red circles for base vertices
# Draw the interpolated vertices
for i in range(21, len(hand_vertices)):
pt = tuple(map(int, hand_vertices[i][:2]))
if 0 <= pt[0] < width and 0 <= pt[1] < height:
cv2.circle(
mesh_vis, pt, 2, (0, 255, 255), -1
)  # Yellow circles for interpolated vertices
# Draw the faces of the mesh
for face in faces:
if len(face) == 3: # Triangle face
pt1 = tuple(map(int, hand_vertices[face[0]][:2]))
pt2 = tuple(map(int, hand_vertices[face[1]][:2]))
pt3 = tuple(map(int, hand_vertices[face[2]][:2]))
if (
0 <= pt1[0] < width
and 0 <= pt1[1] < height
and 0 <= pt2[0] < width
and 0 <= pt2[1] < height
and 0 <= pt3[0] < width
and 0 <= pt3[1] < height
):
cv2.line(
mesh_vis, pt1, pt2, (0, 255, 0), 1
)  # Green lines for mesh edges
cv2.line(mesh_vis, pt2, pt3, (0, 255, 0), 1)
cv2.line(mesh_vis, pt3, pt1, (0, 255, 0), 1)
# Add explanation text
cv2.putText(
mesh_vis,
"Red: Base vertices",
(10, 30),
cv2.FONT_HERSHEY_SIMPLEX,
0.6,
(255, 0, 0),
2,
)
cv2.putText(
mesh_vis,
"Yellow: Interpolated vertices",
(10, 60),
cv2.FONT_HERSHEY_SIMPLEX,
0.6,
(0, 255, 255),
2,
)
cv2.putText(
mesh_vis,
"Green: Mesh edges",
(10, 90),
cv2.FONT_HERSHEY_SIMPLEX,
0.6,
(0, 255, 0),
2,
)
return mesh_vis
def visualize_registration(cloud, vertices, icp_transform):
"""Visualize the ICP registration process"""
height, width = 480, 640 # Fixed size visualization
reg_vis = np.ones((height, width, 3), dtype=np.uint8) * 255 # White background
# Define visualization boundaries
margin = 50
view_width = width - 2 * margin
view_height = height - 2 * margin
# Find 2D min/max values for scaling
all_points_2d = np.vstack([cloud[:, :2], vertices[:, :2]])
min_vals = np.min(all_points_2d, axis=0)
max_vals = np.max(all_points_2d, axis=0)
# Function to scale points to fit visualization
def scale_point(point):
scaled = (point - min_vals) / (max_vals - min_vals)
x = int(scaled[0] * view_width) + margin
y = int(scaled[1] * view_height) + margin
return (x, y)
# Draw original point cloud (red)
for point in cloud[::10]: # Downsample for visualization
pt = scale_point(point[:2])
cv2.circle(reg_vis, pt, 1, (0, 0, 255), -1) # Red dots
# Draw original mesh vertices (blue)
for vertex in vertices:
pt = scale_point(vertex[:2])
cv2.circle(reg_vis, pt, 2, (255, 0, 0), -1) # Blue dots
# Apply transformation to mesh vertices
transformed_vertices = []
for vertex in vertices:
@@ -83,181 +115,319 @@ def visualize_registration(cloud, vertices, icp_transform):
# Apply transformation
v_transformed = np.dot(icp_transform, v_hom)
transformed_vertices.append(v_transformed[:3])
# Draw transformed mesh vertices (green)
for vertex in transformed_vertices:
pt = scale_point(vertex[:2])
cv2.circle(reg_vis, pt, 2, (0, 255, 0), -1) # Green dots
# Add transformation matrix display
cv2.putText(
reg_vis,
"ICP Transformation Matrix:",
(10, height - 120),
cv2.FONT_HERSHEY_SIMPLEX,
0.5,
(0, 0, 0),
1,
)
for i in range(4):
matrix_text = f"[{icp_transform[i][0]:.2f}, {icp_transform[i][1]:.2f}, {icp_transform[i][2]:.2f}, {icp_transform[i][3]:.2f}]"
cv2.putText(
reg_vis,
matrix_text,
(10, height - 90 + i * 20),
cv2.FONT_HERSHEY_SIMPLEX,
0.4,
(0, 0, 0),
1,
)
# Add legend
cv2.putText(
reg_vis,
"Red: Point Cloud",
(width - 200, 30),
cv2.FONT_HERSHEY_SIMPLEX,
0.5,
(0, 0, 255),
1,
)
cv2.putText(
reg_vis,
"Blue: Original Mesh",
(width - 200, 60),
cv2.FONT_HERSHEY_SIMPLEX,
0.5,
(255, 0, 0),
1,
)
cv2.putText(
reg_vis,
"Green: Transformed Mesh",
(width - 200, 90),
cv2.FONT_HERSHEY_SIMPLEX,
0.5,
(0, 255, 0),
1,
)
return reg_vis
def visualize_constraints(
frame, refined_landmarks, constrained_landmarks, camera_intrinsics
):
"""Visualize before and after applying anatomical constraints"""
constraints_vis = frame.copy()
height, width = frame.shape[:2]
focal_x, focal_y, center_x, center_y = camera_intrinsics
# Define finger indices
thumb_indices = [1, 2, 3, 4]
index_indices = [5, 6, 7, 8]
highlighted_indices = thumb_indices + index_indices
# Draw original refined landmarks
for i, landmark in enumerate(refined_landmarks):
x, y, z = landmark
if z > 0:
u = int(x * focal_x / z + center_x)
v = int(y * focal_y / z + center_y)
if 0 <= u < width and 0 <= v < height:
if i in highlighted_indices:
# Draw larger circles for thumb and index fingers (the constrained ones)
cv2.circle(
constraints_vis, (u, v), 5, (0, 0, 255), -1
)  # Red circles
else:
cv2.circle(constraints_vis, (u, v), 3, (0, 0, 255), -1)
# Draw connections for original landmarks
for i in range(len(refined_landmarks) - 1):
if i + 1 in thumb_indices and i in thumb_indices:
# Draw thumb connections
start = refined_landmarks[i]
end = refined_landmarks[i + 1]
if start[2] > 0 and end[2] > 0:
start_u = int(start[0] * focal_x / start[2] + center_x)
start_v = int(start[1] * focal_y / start[2] + center_y)
end_u = int(end[0] * focal_x / end[2] + center_x)
end_v = int(end[1] * focal_y / end[2] + center_y)
if (
0 <= start_u < width
and 0 <= start_v < height
and 0 <= end_u < width
and 0 <= end_v < height
):
cv2.line(
constraints_vis,
(start_u, start_v),
(end_u, end_v),
(0, 0, 255),
2,
)
# Draw constrained landmarks
for i, landmark in enumerate(constrained_landmarks):
x, y, z = landmark
if z > 0:
u = int(x * focal_x / z + center_x)
v = int(y * focal_y / z + center_y)
if 0 <= u < width and 0 <= v < height:
if i in highlighted_indices:
# Draw larger circles for thumb and index fingers
cv2.circle(
constraints_vis, (u, v), 5, (0, 255, 0), -1
)  # Green circles
else:
cv2.circle(constraints_vis, (u, v), 3, (0, 255, 0), -1)
# Draw connections for constrained landmarks
for i in range(len(constrained_landmarks) - 1):
if i + 1 in thumb_indices and i in thumb_indices:
# Draw thumb connections
start = constrained_landmarks[i]
end = constrained_landmarks[i + 1]
if start[2] > 0 and end[2] > 0:
start_u = int(start[0] * focal_x / start[2] + center_x)
start_v = int(start[1] * focal_y / start[2] + center_y)
end_u = int(end[0] * focal_x / end[2] + center_x)
end_v = int(end[1] * focal_y / end[2] + center_y)
if (
0 <= start_u < width
and 0 <= start_v < height
and 0 <= end_u < width
and 0 <= end_v < height
):
cv2.line(
constraints_vis,
(start_u, start_v),
(end_u, end_v),
(0, 255, 0),
2,
)
# Add legend
cv2.putText(
constraints_vis,
"Red: Before constraints",
(10, 30),
cv2.FONT_HERSHEY_SIMPLEX,
0.7,
(0, 0, 255),
2,
)
cv2.putText(
constraints_vis,
"Green: After constraints",
(10, 60),
cv2.FONT_HERSHEY_SIMPLEX,
0.7,
(0, 255, 0),
2,
)
return constraints_vis
def visualize_robot_params(
frame, position, orientation_matrix, gripper_width, camera_intrinsics
):
"""Visualize robot parameters (position, orientation matrix, gripper width)"""
robot_vis = frame.copy()
height, width = frame.shape[:2]
focal_x, focal_y, center_x, center_y = camera_intrinsics
# Extract axes from orientation matrix
x_axis = orientation_matrix[:, 0] # First column (principal axis)
y_axis = orientation_matrix[:, 1] # Second column
z_axis = orientation_matrix[:, 2] # Third column (normal)
# Project position to image coordinates
x, y, z = position
if z > 0:
u = int(x * focal_x / z + center_x)
v = int(y * focal_y / z + center_y)
if 0 <= u < width and 0 <= v < height:
# Draw end effector position
cv2.circle(robot_vis, (u, v), 10, (255, 0, 0), -1) # Blue circle
# Draw X axis (principal axis)
dx, dy, dz = x_axis
scale = 50 # Scale for better visualization
end_u = int((x + dx * scale) * focal_x / (z + dz * scale) + center_x)
end_v = int((y + dy * scale) * focal_y / (z + dz * scale) + center_y)
if 0 <= end_u < width and 0 <= end_v < height:
cv2.line(
robot_vis, (u, v), (end_u, end_v), (0, 0, 255), 2
) # Red line (X axis)
cv2.putText(
robot_vis,
"X",
(end_u, end_v),
cv2.FONT_HERSHEY_SIMPLEX,
0.5,
(0, 0, 255),
2,
)
# Draw Z axis (normal)
nx, ny, nz = z_axis
end_u = int((x + nx * scale) * focal_x / (z + nz * scale) + center_x)
end_v = int((y + ny * scale) * focal_y / (z + nz * scale) + center_y)
if 0 <= end_u < width and 0 <= end_v < height:
cv2.line(
robot_vis, (u, v), (end_u, end_v), (0, 255, 0), 2
) # Green line (Z axis)
cv2.putText(
robot_vis,
"Z",
(end_u, end_v),
cv2.FONT_HERSHEY_SIMPLEX,
0.5,
(0, 255, 0),
2,
)
# Draw Y axis
yx, yy, yz = y_axis
end_u = int((x + yx * scale) * focal_x / (z + yz * scale) + center_x)
end_v = int((y + yy * scale) * focal_y / (z + yz * scale) + center_y)
if 0 <= end_u < width and 0 <= end_v < height:
cv2.line(
robot_vis, (u, v), (end_u, end_v), (255, 0, 0), 2
) # Blue line (Y axis)
cv2.putText(
robot_vis,
"Y",
(end_u, end_v),
cv2.FONT_HERSHEY_SIMPLEX,
0.5,
(255, 0, 0),
2,
)
# Visualize gripper width
gripper_radius = int(gripper_width * 100) # Scale for better visualization
cv2.circle(
robot_vis, (u, v), gripper_radius, (0, 255, 255), 2
)  # Yellow circle
# Add parameter values as text
y_offset = height - 160
cv2.putText(
robot_vis,
f"Position: ({x:.2f}, {y:.2f}, {z:.2f})",
(10, y_offset),
cv2.FONT_HERSHEY_SIMPLEX,
0.5,
(255, 255, 255),
1,
)
y_offset += 30
cv2.putText(
robot_vis,
f"X axis: ({x_axis[0]:.2f}, {x_axis[1]:.2f}, {x_axis[2]:.2f})",
(10, y_offset),
cv2.FONT_HERSHEY_SIMPLEX,
0.5,
(0, 0, 255),
1,
)
y_offset += 30
cv2.putText(
robot_vis,
f"Z axis: ({z_axis[0]:.2f}, {z_axis[1]:.2f}, {z_axis[2]:.2f})",
(10, y_offset),
cv2.FONT_HERSHEY_SIMPLEX,
0.5,
(0, 255, 0),
1,
)
y_offset += 30
cv2.putText(
robot_vis,
f"Gripper Width: {gripper_width:.2f}",
(10, y_offset),
cv2.FONT_HERSHEY_SIMPLEX,
0.5,
(0, 255, 255),
1,
)
return robot_vis