diff --git a/open_phantom/hand_processor.py b/open_phantom/hand_processor.py index ace58f6..b74b51e 100644 --- a/open_phantom/hand_processor.py +++ b/open_phantom/hand_processor.py @@ -1,92 +1,94 @@ import os -import cv2 import time -import torch + +import cv2 +import depth_pro +import mediapipe as mp import numpy as np import open3d as o3d -import mediapipe as mp - +import torch +from PIL import Image +from robot_manager import RobotManager +from sam2.sam2_image_predictor import SAM2ImagePredictor from tqdm import tqdm from utils.visualizations import * -from robot_manager import RobotManager -# from diffusers import StableDiffusionInpaintPipeline class HandProcessor: def __init__(self) -> None: self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") - + self.mp_hands = mp.solutions.hands self.mp_drawing = mp.solutions.drawing_utils self.mp_drawing_styles = mp.solutions.drawing_styles - + self.hands = self.mp_hands.Hands( model_complexity=1, max_num_hands=1, static_image_mode=False, ) - + # NOTE: Look into better depth estimation models # Initialize MiDaS for depth estimation print("Loading MiDaS model...") self.midas = torch.hub.load("intel-isl/MiDaS", "DPT_Hybrid") self.midas.to(self.device) self.midas.eval() - + # Load MiDaS transforms to resize and normalize input images self.midas_transforms = torch.hub.load("intel-isl/MiDaS", "transforms") self.transform = self.midas_transforms.dpt_transform - - # Segment hand from image using MediaPipe Hands landmarks - def _create_mask(self, image: np.ndarray, landmarks: list, thickness: int=5, padding:int=10) -> np.ndarray: - height, width = image.shape[:2] - mask = np.zeros((height, width), dtype=np.uint8) - - points = [(int(landmark.x * width), int(landmark.y * height)) for landmark in landmarks.landmark] - - # Draw filled circles at each landmark - for point in points: - cv2.circle(mask, point, thickness, 255, -1) - - # Connect all landmarks with thick lines - for connection in self.mp_hands.HAND_CONNECTIONS: - start_idx, end_idx = connection - cv2.line(mask, points[start_idx], points[end_idx], 255, thickness) - - # Create palm by connecting base of fingers with wrist - palm_points = [points[0], points[1], points[5], points[9], points[13], points[17]] - cv2.fillPoly(mask, [np.array(palm_points)], 255) - - # Create shape between fingers - finger_bases = [(1,5), (5,9), (9,13), (13,17)] - for base1, base2 in finger_bases: - triangle = np.array([points[0], points[base1], points[base2]]) - cv2.fillPoly(mask, [triangle], 255) - - # Dilate to smooth and expand slightly - kernel = np.ones((padding, padding), np.uint8) - mask = cv2.dilate(mask, kernel, iterations=1) - - # Smooth the mask - mask = cv2.GaussianBlur(mask, (21, 21), 0) - _, mask = cv2.threshold(mask, 50, 255, cv2.THRESH_BINARY) - + + # Load SAM2 model for hand segmentation + print("Loading SAM2 model...") + self.sam2_predictor = SAM2ImagePredictor.from_pretrained( + "facebook/sam2-hiera-large" + ) + + def _create_mask(self, frame: np.ndarray, landmarks: list) -> np.ndarray: + height, width = frame.shape[:2] + frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB) + + # Set image in SAM2 predictor + with torch.inference_mode(), torch.autocast("cuda", dtype=torch.bfloat16): + self.sam2_predictor.set_image(frame_rgb) # Set image for prediction + + # Convert landmarks to point prompts + points = [] + for landmark in landmarks.landmark: + x, y = int(landmark.x * width), int(landmark.y * height) + points.append([x, y]) + + input_points = np.array(points) + + # Predict mask with 
point prompts + masks, _, _ = self.sam2_predictor.predict( + point_coords=input_points, # Pass the points as prompts + point_labels=np.ones( + len(input_points) + ), # All points from hand are foreground + multimask_output=False, # Just get one mask + ) + + mask = masks[0].astype(np.uint8) * 255 + return mask """ Transform input image to match MiDaS model requirements Estimate depth map using MiDaS model """ - # TODO: Look into why screen goes black sometimes + + # TODO: Swap MiDaS for ml-depth-pro model def _estimate_depth(self, image: np.ndarray) -> tuple: img = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) - + # Transform img for MiDaS model input_batch = self.transform(img).to(self.device) - + with torch.inference_mode(): prediction = self.midas(input_batch) - + # Interpolate to original size prediction = torch.nn.functional.interpolate( prediction.unsqueeze(1), @@ -94,127 +96,146 @@ class HandProcessor: mode="bicubic", align_corners=False, ).squeeze() - + # Convert to numpy and normalize to 0-255 for visualization depth_map = prediction.cpu().numpy() depth_min = depth_map.min() depth_max = depth_map.max() depth_map_normalized = 255 * (depth_map - depth_min) / (depth_max - depth_min) - + return depth_map, depth_map_normalized.astype(np.uint8) - + # TODO: Look into better depth scaling def _create_cloud(self, depth_map: np.ndarray, hand_mask: np.ndarray) -> np.ndarray: focal_x, focal_y, center_x, center_y = self.camera_intrinsics - - v_coords, u_coords = np.where(hand_mask > 0) + + v_coords, u_coords = np.where(hand_mask > 0) z_values = depth_map[v_coords, u_coords] - + # Filter out zero depth values valid_indices = z_values > 0 u_coords = u_coords[valid_indices] v_coords = v_coords[valid_indices] z_values = z_values[valid_indices] - - # NOTE: Abritrary scaling factor for depth + z_metric = z_values * 0.5 - + # Back-project to 3D using camera intrinsics x_values = (u_coords - center_x) * z_metric / focal_x y_values = (v_coords - center_y) * z_metric / focal_y - + points = np.column_stack((x_values, y_values, z_metric)) - + return points - - # TODO: Look into better Z scaling + + # TODO: Look into better depth scaling def _create_mesh(self, landmarks: list, image_dims: tuple) -> np.ndarray: width, height = image_dims - + vertices = [] for landmark in landmarks.landmark: - vertices.append([ - landmark.x * width, - landmark.y * height, - landmark.z * width - ]) - + vertices.append( + [landmark.x * width, landmark.y * height, landmark.z * width] + ) + # Define faces (triangles) connecting landmarks faces = [ # Palm - [0, 1, 5], [0, 5, 9], [0, 9, 13], [0, 13, 17], + [0, 1, 5], + [0, 5, 9], + [0, 9, 13], + [0, 13, 17], # Thumb - [1, 2, 3], [2, 3, 4], + [1, 2, 3], + [2, 3, 4], # Index finger - [5, 6, 7], [6, 7, 8], + [5, 6, 7], + [6, 7, 8], # Middle finger - [9, 10, 11], [10, 11, 12], + [9, 10, 11], + [10, 11, 12], # Ring finger - [13, 14, 15], [14, 15, 16], + [13, 14, 15], + [14, 15, 16], # Pinky - [17, 18, 19], [18, 19, 20] + [17, 18, 19], + [18, 19, 20], ] - + dense_vertices = list(vertices) - + # Add interpolated vertices along finger segments connections = self.mp_hands.HAND_CONNECTIONS for connection in connections: start_idx, end_idx = connection start_point = np.array(vertices[start_idx]) end_point = np.array(vertices[end_idx]) - + # Add 2 interpolated points between each connected pair for t in [0.33, 0.66]: - interp_point = start_point * (1-t) + end_point * t + interp_point = start_point * (1 - t) + end_point * t dense_vertices.append(interp_point.tolist()) - + return 
np.array(dense_vertices), np.array(faces) - + + # TODO: Improve ICP registration for better alignment def _icp_registration(self, cloud: np.ndarray, vertices: np.ndarray) -> np.ndarray: source = o3d.geometry.PointCloud() source.points = o3d.utility.Vector3dVector(vertices) - + target = o3d.geometry.PointCloud() target.points = o3d.utility.Vector3dVector(cloud) - + + # Calculate centroids + source_centroid = np.mean(vertices, axis=0) + target_centroid = np.mean(cloud, axis=0) + + # Calculate initial translation to align centroids + initial_translation = target_centroid - source_centroid + + # Create initial transformation matrix + trans_init = np.eye(4) + trans_init[:3, 3] = initial_translation + result = o3d.pipelines.registration.registration_icp( source, - target, - max_correspondence_distance=0.05, + target, + max_correspondence_distance=0.1, # Increased distance + init=trans_init, # Initial transformation estimation_method=o3d.pipelines.registration.TransformationEstimationPointToPoint(), + criteria=o3d.pipelines.registration.ICPConvergenceCriteria( + max_iteration=100 + ), # Increased iterations ) - + transformation = result.transformation - + return transformation - + + # TODO: Look into better depth scaling def _refine_landmarks(self, landmarks: list, transform: int, image_dims: tuple): width, height = image_dims - + refined_landmarks = [] for landmark in landmarks.landmark: - point = np.array([ - landmark.x * width, - landmark.y * height, - landmark.z * width, # TODO: Figure out better scaling factor - 1.0 - ]) - + point = np.array( + [landmark.x * width, landmark.y * height, landmark.z * width, 1.0] + ) + # Apply transformation to 3D point transformed = np.dot(transform, point) refined_landmarks.append(transformed[:3]) - + return refined_landmarks - + def _apply_constraints(self, landmarks: list): constrained = np.array(landmarks) - + # Define finger joint indices # MediaPipe hand model: Wrist is 0, thumb is 1-4, index is 5-8, etc. 
thumb_indices = [1, 2, 3, 4] index_indices = [5, 6, 7, 8] - + # Constrain the last two joints of thumb and index finger # as mentioned in the paper for finger_indices in [thumb_indices, index_indices]: @@ -224,115 +245,165 @@ class HandProcessor: joint1 = constrained[finger_indices[-3]] joint2 = constrained[finger_indices[-2]] joint3 = constrained[finger_indices[-1]] - + # Direction of the first segment dir1 = joint2 - joint1 dir1 = dir1 / np.linalg.norm(dir1) - + # Instead of full ball joint, constrain the last joint's direction # to follow similar direction as the previous segment ideal_dir = dir1.copy() actual_dir = joint3 - joint2 actual_length = np.linalg.norm(actual_dir) - + # Force the direction to be within a cone of the previous segment # (limiting to single degree of freedom approximately) corrected_dir = ideal_dir * actual_length - + # Apply the correction constrained[finger_indices[-1]] = joint2 + corrected_dir - + return constrained - - def _get_robot_params(self, refined_landmarks) -> tuple: - # Extract key landmarks - wrist = refined_landmarks[0] # Wrist landmark - thumb_tip = refined_landmarks[4] # Thumb tip - index_tip = refined_landmarks[8] # Index finger tip - - # Calculate end effector position (midpoint between thumb and index tips) + + def _get_robot_params(self, refined_landmarks): + # Extract keypoints + landmarks = np.array(refined_landmarks) + + # Define indices for specific parts of the hand + thumb_indices = [1, 2, 3, 4] # Thumb landmarks + index_indices = [5, 6, 7, 8] # Index finger landmarks + thumb_tip_idx = 4 + index_tip_idx = 8 + + # 1. Set target position as midpoint between thumb tip and index tip + thumb_tip = landmarks[thumb_tip_idx] + index_tip = landmarks[index_tip_idx] position = (thumb_tip + index_tip) / 2 - - # Calculate vectors for orientation - v1 = thumb_tip - wrist # Vector from wrist to thumb tip - v2 = index_tip - wrist # Vector from wrist to index tip - - # Calculate normal to hand plane - normal = np.cross(v1, v2) - if np.linalg.norm(normal) > 0: - normal = normal / np.linalg.norm(normal) - else: - # Default if vectors are collinear - normal = np.array([0, 0, 1]) - - # Calculate main direction along thumb - direction = thumb_tip - wrist - if np.linalg.norm(direction) > 0: - direction = direction / np.linalg.norm(direction) - else: - # Default if thumb is at wrist (unlikely) - direction = np.array([1, 0, 0]) - - # Calculate gripper width + + # 2. 
Calculate orientation + # Get all thumb and index finger points for plane fitting + thumb_points = landmarks[thumb_indices] + index_points = landmarks[index_indices] + finger_points = np.vstack([thumb_points, index_points]) + + # Fit plane to finger points + centroid = np.mean(finger_points, axis=0) + centered_points = finger_points - centroid + + # Use SVD to find the normal to the best-fitting plane + u, s, vh = np.linalg.svd(centered_points) + # The normal is the last right singular vector + normal = vh[2, :] + + # Ensure normal is a unit vector + normal = normal / np.linalg.norm(normal) + + # Fit a principal axis through thumb points + thumb_centroid = np.mean(thumb_points, axis=0) + thumb_centered = thumb_points - thumb_centroid + + # Use SVD again to find direction of maximum variance (principal axis) + u, s, vh = np.linalg.svd(thumb_centered) + principal_axis = vh[0, :] # First singular vector + + # Ensure principal axis is a unit vector + principal_axis = principal_axis / np.linalg.norm(principal_axis) + + # Construct orthogonal vectors for orientation matrix + z_axis = normal # Normal to the plane + x_axis = principal_axis # Along the thumb + + # Ensure x_axis is orthogonal to z_axis + x_axis = x_axis - np.dot(x_axis, z_axis) * z_axis + x_axis = x_axis / np.linalg.norm(x_axis) + + # Compute y_axis as cross product to ensure orthogonality + y_axis = np.cross(z_axis, x_axis) + + # Create rotation matrix + rotation_matrix = np.column_stack([x_axis, y_axis, z_axis]) + + # 3. Calculate gripper width gripper_width = np.linalg.norm(thumb_tip - index_tip) - - return position, direction, normal, gripper_width - + + # Apply percentile threshold to mitigate slippage (as mentioned in the paper) + # Store gripper widths in a buffer to compute percentiles + if not hasattr(self, "gripper_width_buffer"): + self.gripper_width_buffer = [] + + self.gripper_width_buffer.append(gripper_width) + # Keep only the last 100 values + if len(self.gripper_width_buffer) > 100: + self.gripper_width_buffer.pop(0) + + # Apply 20th percentile threshold + if len(self.gripper_width_buffer) > 5: # Need enough samples + min_width = np.percentile(self.gripper_width_buffer, 20) + if gripper_width < min_width: + gripper_width = min_width + + return position, rotation_matrix, gripper_width + def record_video(self) -> str: - output_dir = os.path.join(os.getcwd(), 'recordings') + output_dir = os.path.join(os.getcwd(), "recordings") os.makedirs(output_dir, exist_ok=True) - + timestamp = time.strftime("%Y%m%d-%H%M%S") video_filename = os.path.join(output_dir, f"recorded_video_{timestamp}.mp4") - + cap = cv2.VideoCapture(0) - + assert cap.isOpened(), "Failed to open camera." - + width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)) height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)) fps = int(cap.get(cv2.CAP_PROP_FPS)) - + # Update camera intrinsics based on video dimensions self.camera_intrinsics = (width * 0.8, height * 0.8, width / 2, height / 2) - - fourcc = cv2.VideoWriter_fourcc(*'mp4v') + + fourcc = cv2.VideoWriter_fourcc(*"mp4v") out = None - + recording = False did_record = False - + print("Camera is active. Press 'r' to start/stop recording, 'q' to quit.") - + while cap.isOpened(): success, frame = cap.read() assert success, "Failed to read from camera." 
- + # Mirror the image for more intuitive viewing frame = cv2.flip(frame, 1) - + # Create a separate display frame for showing the recording indicator display_frame = frame.copy() - + # Display recording indicator ONLY on the display frame if recording: cv2.circle(display_frame, (30, 30), 15, (0, 0, 255), -1) - cv2.putText(display_frame, "RECORDING", (50, 40), cv2.FONT_HERSHEY_SIMPLEX, - 1, (0, 0, 255), 2, cv2.LINE_AA) - + cv2.putText( + display_frame, + "RECORDING", + (50, 40), + cv2.FONT_HERSHEY_SIMPLEX, + 1, + (0, 0, 255), + 2, + cv2.LINE_AA, + ) + # Show the display frame (with indicator if recording) - cv2.imshow('Video Recording', display_frame) - + cv2.imshow("Video Recording", display_frame) + # Write the original frame (without indicator) to video file if recording if recording and out is not None: out.write(frame) - + key = cv2.waitKey(1) & 0xFF - - # NOTE: Haven't tested what happens if user records multiple videos in one session (probably overwrites?) - if key == ord('q'): - break - elif key == ord('r'): + + if key == ord("r"): recording = not recording if recording: print(f"Started recording to {video_filename}") @@ -342,29 +413,30 @@ class HandProcessor: out.release() print(f"Stopped recording. Video saved to {video_filename}") did_record = True - + break + if out is not None: out.release() cap.release() cv2.destroyAllWindows() - + return video_filename if did_record else None - + def process_video(self, video_path: str) -> tuple: assert video_path, "Video path is required." - + cap = cv2.VideoCapture(video_path) - + assert cap.isOpened(), f"Failed to open video file {video_path}" - + width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)) height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)) fps = int(cap.get(cv2.CAP_PROP_FPS)) total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT)) - + # Update camera intrinsics based on video dimensions self.camera_intrinsics = (width * 0.8, height * 0.8, width / 2, height / 2) - + base_path = os.path.splitext(video_path)[0] segmented_output_path = f"{base_path}_masked.mp4" depth_output_path = f"{base_path}_depth.mp4" @@ -372,94 +444,112 @@ class HandProcessor: registration_output_path = f"{base_path}_registration.mp4" constraints_output_path = f"{base_path}_constraints.mp4" robot_output_path = f"{base_path}_robot.mp4" - - fourcc = cv2.VideoWriter_fourcc(*'mp4v') - segmented_out = cv2.VideoWriter(segmented_output_path, fourcc, fps, (width, height)) + + fourcc = cv2.VideoWriter_fourcc(*"mp4v") + segmented_out = cv2.VideoWriter( + segmented_output_path, fourcc, fps, (width, height) + ) depth_out = cv2.VideoWriter(depth_output_path, fourcc, fps, (width, height)) mesh_out = cv2.VideoWriter(mesh_output_path, fourcc, fps, (width, height)) - reg_out = cv2.VideoWriter(registration_output_path, fourcc, fps, (640, 480)) # Fixed size - constraints_out = cv2.VideoWriter(constraints_output_path, fourcc, fps, (width, height)) + reg_out = cv2.VideoWriter( + registration_output_path, fourcc, fps, (640, 480) + ) # Fixed size + constraints_out = cv2.VideoWriter( + constraints_output_path, fourcc, fps, (width, height) + ) robot_out = cv2.VideoWriter(robot_output_path, fourcc, fps, (width, height)) - - print(f"Processing video with {total_frames} frames...") + + print(f"Processing video with {total_frames} frames...") for _ in tqdm(range(total_frames)): success, frame = cap.read() assert success, "Failed to read frame from video." 
- + # Convert image to RGB for MediaPipe image_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB) - + # Process with MediaPipe Hands results = self.hands.process(image_rgb) - + # Initialize output frames segmented_frame = frame.copy() depth_frame = np.zeros((height, width, 3), dtype=np.uint8) mesh_frame = frame.copy() - reg_frame = np.ones((480, 640, 3), dtype=np.uint8) * 255 # Fixed size, white background + reg_frame = ( + np.ones((480, 640, 3), dtype=np.uint8) * 255 + ) # Fixed size, white background constraints_frame = frame.copy() robot_frame = frame.copy() - + # Process if hand is detected if results.multi_hand_landmarks: for hand_landmarks in results.multi_hand_landmarks: - # Segment hand hand_mask = self._create_mask(frame, hand_landmarks) mask_overlay = frame.copy() mask_overlay[hand_mask > 0] = [0, 0, 255] # Red color for mask segmented_frame = cv2.addWeighted(frame, 0.7, mask_overlay, 0.3, 0) - - # Depth estimation + depth_map, depth_vis = self._estimate_depth(frame) depth_colored = cv2.applyColorMap(depth_vis, cv2.COLORMAP_JET) depth_frame = depth_colored.copy() - - # Create hand mesh - hand_vertices, hand_faces = self._create_mesh(hand_landmarks, (width, height)) - mesh_frame = visualize_mesh(frame, hand_vertices, hand_faces) - - # Create point cloud from depth + cloud = self._create_cloud(depth_map, hand_mask) - - # Perform ICP registration + + hand_vertices, hand_faces = self._create_mesh( + hand_landmarks, (width, height) + ) + mesh_frame = visualize_mesh(frame, hand_vertices, hand_faces) + icp_transform = self._icp_registration(cloud, hand_vertices) - reg_frame = visualize_registration(cloud, hand_vertices, icp_transform) - - # Refine landmarks using ICP transformation - refined_landmarks = self._refine_landmarks(hand_landmarks, icp_transform, (width, height)) - - # Store pre-constraint landmarks for visualization + reg_frame = visualize_registration( + cloud, hand_vertices, icp_transform + ) + + refined_landmarks = self._refine_landmarks( + hand_landmarks, icp_transform, (width, height) + ) + original_refined = refined_landmarks.copy() - - # Apply anatomical constraints + constrained_landmarks = self._apply_constraints(refined_landmarks) - constraints_frame = visualize_constraints(frame, original_refined, constrained_landmarks, self.camera_intrinsics) - - # # Calculate robot parameters - # position, direction, normal, gripper_width = self._get_robot_params(constrained_landmarks) - # robot_frame = visualize_robot_params(frame, position, direction, normal, gripper_width) - + constraints_frame = visualize_constraints( + frame, + original_refined, + constrained_landmarks, + self.camera_intrinsics, + ) + + position, orientation, gripper_width = self._get_robot_params( + constrained_landmarks + ) + robot_frame = visualize_robot_params( + frame, + position, + orientation, + gripper_width, + self.camera_intrinsics, + ) + segmented_out.write(segmented_frame) depth_out.write(depth_frame) mesh_out.write(mesh_frame) reg_out.write(reg_frame) constraints_out.write(constraints_frame) robot_out.write(robot_frame) - + display_scale = 0.5 display_size = (int(width * display_scale), int(height * display_scale)) reg_display_size = (int(640 * display_scale), int(480 * display_scale)) - - cv2.imshow('Segmented', cv2.resize(segmented_frame, display_size)) - cv2.imshow('Depth', cv2.resize(depth_frame, display_size)) - cv2.imshow('Mesh', cv2.resize(mesh_frame, display_size)) - cv2.imshow('Registration', cv2.resize(reg_frame, reg_display_size)) - cv2.imshow('Constraints', cv2.resize(constraints_frame, 
display_size)) - cv2.imshow('Robot Parameters', cv2.resize(robot_frame, display_size)) - + + cv2.imshow("Segmented", cv2.resize(segmented_frame, display_size)) + cv2.imshow("Depth", cv2.resize(depth_frame, display_size)) + cv2.imshow("Mesh", cv2.resize(mesh_frame, display_size)) + cv2.imshow("Registration", cv2.resize(reg_frame, reg_display_size)) + cv2.imshow("Constraints", cv2.resize(constraints_frame, display_size)) + cv2.imshow("Robot Parameters", cv2.resize(robot_frame, display_size)) + if cv2.waitKey(1) & 0xFF == 27: # ESC to exit break - + cap.release() segmented_out.release() depth_out.release() @@ -468,7 +558,7 @@ class HandProcessor: constraints_out.release() robot_out.release() cv2.destroyAllWindows() - + print(f"Processing complete. Results saved to:") print(f"- Hand mask: {segmented_output_path}") print(f"- Depth visualization: {depth_output_path}") @@ -476,12 +566,12 @@ class HandProcessor: print(f"- Registration visualization: {registration_output_path}") print(f"- Constraints visualization: {constraints_output_path}") print(f"- Robot parameters: {robot_output_path}") - + return { "segmented": segmented_output_path, "depth": depth_output_path, "mesh": mesh_output_path, "registration": registration_output_path, "constraints": constraints_output_path, - "robot": robot_output_path - } \ No newline at end of file + "robot": robot_output_path, + } diff --git a/open_phantom/main.py b/open_phantom/main.py index 7e07d22..6273556 100644 --- a/open_phantom/main.py +++ b/open_phantom/main.py @@ -5,16 +5,16 @@ from hand_processor import HandProcessor def main(): processor = HandProcessor() - + record_option = input("Record a new video? (y/n): ") - - if record_option.lower() == 'y': + + if record_option.lower() == "y": print("Starting video recording...") video_path = processor.record_video() if not video_path: print("Video recording failed.") return - + print(f"Video recorded successfully to {video_path}") process_option = input("Process the video now? 
(y/n): ") if process_option.lower() == "n": @@ -25,9 +25,9 @@ def main(): while not os.path.exists(video_path): print("Error: Video file not found.") video_path = input("Enter the path to the video file: ") - + processor.process_video(video_path) - - + + if __name__ == "__main__": - main() \ No newline at end of file + main() diff --git a/open_phantom/robot_manager.py b/open_phantom/robot_manager.py index 3dc215a..2d58d85 100644 --- a/open_phantom/robot_manager.py +++ b/open_phantom/robot_manager.py @@ -1,72 +1,75 @@ import os + import cv2 import numpy as np import pybullet as p - -from utils.math import * from utils.handle_urdf import handle_urdf -from scipy.spatial.transform import Rotation as R - + class RobotManager: def __init__(self, urdf_path: str, camera_intrinsics: tuple) -> None: self.physics_client = p.connect(p.DIRECT) - + robot_urdf = handle_urdf(urdf_path) self.robot_id = self._load_robot(robot_urdf) - + self.joint_count = p.getNumJoints(self.robot_id) self.end_effector_index = self._find_end_effector() - + self.fx, self.fy, self.cx, self.cy = camera_intrinsics # Set up rendering parameters self.img_width = int(self.cx * 2) self.img_height = int(self.cy * 2) - # Load the robot URDF into PyBullet + # Load the robot URDF into PyBullet def _load_robot(self, robot_urdf: str) -> int: try: robot_id = p.loadURDF( robot_urdf, basePosition=[0, 0, 0], useFixedBase=True, - flags=p.URDF_USE_SELF_COLLISION | p.URDF_USE_INERTIA_FROM_FILE + flags=p.URDF_USE_SELF_COLLISION | p.URDF_USE_INERTIA_FROM_FILE, ) except p.error as e: print(f"PyBullet error when loading URDF: {e}") raise e - - robot_name = p.getBodyInfo(robot_id)[1].decode('utf-8') + + robot_name = p.getBodyInfo(robot_id)[1].decode("utf-8") print(f"Successfully loaded {robot_name} robot with ID: {robot_id}") - + return robot_id - + # NOTE: Only applicable if the robot has one end effector def _find_end_effector(self) -> int: assert self.joint_count > 0, "Robot has no joints" - + # Keywords to look for in joint names to identify end effector - keywords = ['gripper', 'tool', 'tcp', 'end_effector', 'hand'] - + keywords = ["gripper", "tool", "tcp", "end_effector", "hand"] + for i in range(self.joint_count): info = p.getJointInfo(self.robot_id, i) - joint_name = info[1].decode('utf-8').lower() - + joint_name = info[1].decode("utf-8").lower() + # Check if any keyword is in the joint name if any(keyword in joint_name for keyword in keywords): return i - + # If no specific end effector found, use the last joint in the chain return self.joint_count - 1 - + # TODO: Use inverse kinematics to set the robot pose - def set_robot_pose(self, position: np.ndarray, orientation_vectors: np.ndarray, gripper_width: float) -> None: + def set_robot_pose( + self, + position: np.ndarray, + orientation_vectors: np.ndarray, + gripper_width: float, + ) -> None: pass - + # Render the robot in some scene using some camera parameters def render_robot(self, bg_image=None, camera_params=None): assert self.robot_id >= 0, "Robot not properly loaded" - + # Set up camera parameters if camera_params is None: # Default camera setup @@ -77,40 +80,37 @@ class RobotManager: cam_roll = 0 else: cam_target, cam_distance, cam_yaw, cam_pitch, cam_roll = camera_params - + # Compute view matrix view_matrix = p.computeViewMatrixFromYawPitchRoll( cameraTargetPosition=cam_target, distance=cam_distance, - yaw=cam_yaw, + yaw=cam_yaw, pitch=cam_pitch, roll=cam_roll, - upAxisIndex=2 + upAxisIndex=2, ) - + # Compute projection matrix aspect = self.img_width / self.img_height proj_matrix = 
p.computeProjectionMatrixFOV( - fov=60, - aspect=aspect, - nearVal=0.01, - farVal=100.0 + fov=60, aspect=aspect, nearVal=0.01, farVal=100.0 ) - + # Render the scene img_arr = p.getCameraImage( width=self.img_width, height=self.img_height, viewMatrix=view_matrix, projectionMatrix=proj_matrix, - renderer=p.ER_BULLET_HARDWARE_OPENGL + renderer=p.ER_BULLET_HARDWARE_OPENGL, ) - + # Extract RGB and depth rgb = np.reshape(img_arr[2], (self.img_height, self.img_width, 4)) rgb = rgb[:, :, :3] # Remove alpha channel depth = np.reshape(img_arr[3], (self.img_height, self.img_width)) - + # If background image is provided, composite if bg_image is not None: # Resize background if needed @@ -119,31 +119,33 @@ class RobotManager: bg_resized = cv2.resize(bg_image, (self.img_width, self.img_height)) else: bg_resized = bg_image - + # Create mask from depth mask = (depth < 0.99).astype(np.float32) mask = np.stack([mask, mask, mask], axis=2) - + # Composite composite = bg_resized * (1 - mask) + rgb * mask return composite.astype(np.uint8) - + return rgb.astype(np.uint8) - + def __del__(self) -> None: - if hasattr(self, 'physics_client'): + if hasattr(self, "physics_client"): try: p.disconnect(self.physics_client) except: pass - - + if __name__ == "__main__": cwd = os.getcwd() - urdf_path = os.path.join(cwd, "notebook/phantom/urdf/SO_5DOF_ARM100_05d.SLDASM/urdf/SO_5DOF_ARM100_05d.SLDASM.urdf") - camera_intrinsics = (320, 240, 320, 240) # Random intrinsics for example - + urdf_path = os.path.join( + cwd, + "notebook/phantom/urdf/SO_5DOF_ARM100_05d.SLDASM/urdf/SO_5DOF_ARM100_05d.SLDASM.urdf", + ) + camera_intrinsics = (320, 240, 320, 240) # Random intrinsics for example + robot_vis = RobotManager(urdf_path, camera_intrinsics) rendered_image = robot_vis.render_robot() @@ -155,4 +157,4 @@ if __name__ == "__main__": # Option 2: Save the image to a file output_path = "robot_render.png" cv2.imwrite(output_path, rendered_image) - print(f"Render saved to {output_path}") \ No newline at end of file + print(f"Render saved to {output_path}") diff --git a/open_phantom/utils/calibrations.py b/open_phantom/utils/calibrations.py new file mode 100644 index 0000000..8b53754 --- /dev/null +++ b/open_phantom/utils/calibrations.py @@ -0,0 +1 @@ +# TODO: Implement camera calibration functions to extract camera extrinsics and intrinsics diff --git a/open_phantom/utils/handle_urdf.py b/open_phantom/utils/handle_urdf.py index d4cb0d5..33afc1c 100644 --- a/open_phantom/utils/handle_urdf.py +++ b/open_phantom/utils/handle_urdf.py @@ -2,7 +2,6 @@ import os import xml.dom.minidom as minidom import xml.etree.ElementTree as ET - """ Fixes a specific path that uses the ROS-style 'package://' prefix. The 'package://' prefix is used in URDF files to refer to files in the same package. @@ -10,26 +9,28 @@ However, when we're not running in a ROS environment, the paths are not valid. This function tries to find the absolute path of the mesh file. If the mesh file is not found, the original path is used. 
""" + + def fix_path(path: str, urdf_dir: str) -> str: - if path.startswith('package://'): - parts = path[len('package://'):].split('/', 1) + if path.startswith("package://"): + parts = path[len("package://") :].split("/", 1) if len(parts) == 2: package_name, rel_path = parts - + # Try potential locations for the mesh potential_paths = [ os.path.join(urdf_dir, rel_path), - os.path.join(urdf_dir, '../meshes', rel_path), - os.path.join(urdf_dir, f'../{package_name}', rel_path), - os.path.join(urdf_dir, '../..', rel_path) + os.path.join(urdf_dir, "../meshes", rel_path), + os.path.join(urdf_dir, f"../{package_name}", rel_path), + os.path.join(urdf_dir, "../..", rel_path), ] - + for possible_path in potential_paths: if os.path.exists(possible_path): return possible_path - - print(f"Failed to find mesh for package path: {path}") - + + print(f"Failed to find mesh for package path: {path}") + return path @@ -37,58 +38,67 @@ def fix_path(path: str, urdf_dir: str) -> str: Iterates through the URDF file and fixes the paths of all mesh files. The URDF file is parsed and the mesh paths are modified in-place. """ -def fix_mesh_paths(urdf_path:str, urdf_dir: str) -> str: + + +def fix_mesh_paths(urdf_path: str, urdf_dir: str) -> str: root = ET.parse(urdf_path).getroot() - + try: - for mesh in root.findall('.//mesh'): - if 'filename' in mesh.attrib: - mesh.attrib['filename'] = fix_path(mesh.attrib['filename'], urdf_dir) + for mesh in root.findall(".//mesh"): + if "filename" in mesh.attrib: + mesh.attrib["filename"] = fix_path(mesh.attrib["filename"], urdf_dir) except Exception as e: print(f"Error fixing mesh paths: {e}") raise e - - fixed_path = urdf_path.replace('.urdf', '_fixed.urdf') - + + fixed_path = urdf_path.replace(".urdf", "_fixed.urdf") + return root, fixed_path """ Formats the XML tree to be human-readable and writes it to a file. 
""" + + def format_xml(root: ET.Element, fixed_path: str) -> None: - xml_str = ET.tostring(root, encoding='utf-8') + xml_str = ET.tostring(root, encoding="utf-8") dom = minidom.parseString(xml_str) - - with open(fixed_path, 'w', encoding='utf-8') as f: + + with open(fixed_path, "w", encoding="utf-8") as f: # Write with nice indentation but remove extra whitespace - pretty_xml = dom.toprettyxml(indent=' ') + pretty_xml = dom.toprettyxml(indent=" ") # Remove extra blank lines that minidom sometimes adds - pretty_xml = '\n'.join([line for line in pretty_xml.split('\n') if line.strip()]) + pretty_xml = "\n".join( + [line for line in pretty_xml.split("\n") if line.strip()] + ) f.write(pretty_xml) - + def handle_urdf(urdf_path: str) -> str: # Check if URDF is valid if not os.path.exists(urdf_path): print(f"Invalid URDF path: {urdf_path}") return None - + # FIXME: Add check to see if URDF needs to be processed - + try: urdf_dir = os.path.dirname(urdf_path) - root, fixed_path = fix_mesh_paths(urdf_path, urdf_dir) + root, fixed_path = fix_mesh_paths(urdf_path, urdf_dir) format_xml(root, fixed_path) print(f"Successfully processed URDF: {fixed_path}") return fixed_path except Exception as e: print(f"Failed to process URDF: {e}") raise e - -if __name__ == '__main__': + +if __name__ == "__main__": # Example usage cwd = os.getcwd() - urdf_path = os.path.join(cwd, "notebook/phantom/urdf/SO_5DOF_ARM100_05d.SLDASM/urdf/SO_5DOF_ARM100_05d.SLDASM.urdf") - handle_urdf(urdf_path) \ No newline at end of file + urdf_path = os.path.join( + cwd, + "open_phantom/urdf/SO_5DOF_ARM100_05d.SLDASM/urdf/SO_5DOF_ARM100_05d.SLDASM.urdf", + ) + handle_urdf(urdf_path) diff --git a/open_phantom/utils/visualizations.py b/open_phantom/utils/visualizations.py index fe1d3f6..e3523a3 100644 --- a/open_phantom/utils/visualizations.py +++ b/open_phantom/utils/visualizations.py @@ -6,75 +6,107 @@ def visualize_mesh(frame, hand_vertices, faces): """Visualize the hand mesh on the frame""" mesh_vis = frame.copy() height, width = frame.shape[:2] - + # Draw the base vertices of the mesh (original landmarks) for i in range(21): pt = tuple(map(int, hand_vertices[i][:2])) if 0 <= pt[0] < width and 0 <= pt[1] < height: - cv2.circle(mesh_vis, pt, 4, (255, 0, 0), -1) # Red circles for base vertices - + cv2.circle( + mesh_vis, pt, 4, (255, 0, 0), -1 + ) # Red circles for base vertices + # Draw the interpolated vertices for i in range(21, len(hand_vertices)): pt = tuple(map(int, hand_vertices[i][:2])) if 0 <= pt[0] < width and 0 <= pt[1] < height: - cv2.circle(mesh_vis, pt, 2, (0, 255, 255), -1) # Yellow circles for interpolated vertices - + cv2.circle( + mesh_vis, pt, 2, (0, 255, 255), -1 + ) # Yellow circles for interpolated vertices + # Draw the faces of the mesh for face in faces: if len(face) == 3: # Triangle face pt1 = tuple(map(int, hand_vertices[face[0]][:2])) pt2 = tuple(map(int, hand_vertices[face[1]][:2])) pt3 = tuple(map(int, hand_vertices[face[2]][:2])) - - if (0 <= pt1[0] < width and 0 <= pt1[1] < height and - 0 <= pt2[0] < width and 0 <= pt2[1] < height and - 0 <= pt3[0] < width and 0 <= pt3[1] < height): - cv2.line(mesh_vis, pt1, pt2, (0, 255, 0), 1) # Green lines for mesh edges + + if ( + 0 <= pt1[0] < width + and 0 <= pt1[1] < height + and 0 <= pt2[0] < width + and 0 <= pt2[1] < height + and 0 <= pt3[0] < width + and 0 <= pt3[1] < height + ): + cv2.line( + mesh_vis, pt1, pt2, (0, 255, 0), 1 + ) # Green lines for mesh edges cv2.line(mesh_vis, pt2, pt3, (0, 255, 0), 1) cv2.line(mesh_vis, pt3, pt1, (0, 255, 0), 1) - + # Add 
explanation text - cv2.putText(mesh_vis, "Red: Base vertices", (10, 30), - cv2.FONT_HERSHEY_SIMPLEX, 0.6, (255, 0, 0), 2) - cv2.putText(mesh_vis, "Yellow: Interpolated vertices", (10, 60), - cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0, 255, 255), 2) - cv2.putText(mesh_vis, "Green: Mesh edges", (10, 90), - cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0, 255, 0), 2) - + cv2.putText( + mesh_vis, + "Red: Base vertices", + (10, 30), + cv2.FONT_HERSHEY_SIMPLEX, + 0.6, + (255, 0, 0), + 2, + ) + cv2.putText( + mesh_vis, + "Yellow: Interpolated vertices", + (10, 60), + cv2.FONT_HERSHEY_SIMPLEX, + 0.6, + (0, 255, 255), + 2, + ) + cv2.putText( + mesh_vis, + "Green: Mesh edges", + (10, 90), + cv2.FONT_HERSHEY_SIMPLEX, + 0.6, + (0, 255, 0), + 2, + ) + return mesh_vis + def visualize_registration(cloud, vertices, icp_transform): - """Visualize the ICP registration process""" height, width = 480, 640 # Fixed size visualization reg_vis = np.ones((height, width, 3), dtype=np.uint8) * 255 # White background - + # Define visualization boundaries margin = 50 view_width = width - 2 * margin view_height = height - 2 * margin - + # Find 2D min/max values for scaling all_points_2d = np.vstack([cloud[:, :2], vertices[:, :2]]) min_vals = np.min(all_points_2d, axis=0) max_vals = np.max(all_points_2d, axis=0) - + # Function to scale points to fit visualization def scale_point(point): scaled = (point - min_vals) / (max_vals - min_vals) x = int(scaled[0] * view_width) + margin y = int(scaled[1] * view_height) + margin return (x, y) - + # Draw original point cloud (red) for point in cloud[::10]: # Downsample for visualization pt = scale_point(point[:2]) cv2.circle(reg_vis, pt, 1, (0, 0, 255), -1) # Red dots - + # Draw original mesh vertices (blue) for vertex in vertices: pt = scale_point(vertex[:2]) cv2.circle(reg_vis, pt, 2, (255, 0, 0), -1) # Blue dots - + # Apply transformation to mesh vertices transformed_vertices = [] for vertex in vertices: @@ -83,181 +115,319 @@ def visualize_registration(cloud, vertices, icp_transform): # Apply transformation v_transformed = np.dot(icp_transform, v_hom) transformed_vertices.append(v_transformed[:3]) - + # Draw transformed mesh vertices (green) for vertex in transformed_vertices: pt = scale_point(vertex[:2]) cv2.circle(reg_vis, pt, 2, (0, 255, 0), -1) # Green dots - + # Add transformation matrix display - cv2.putText(reg_vis, "ICP Transformation Matrix:", (10, height - 120), - cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 0), 1) - + cv2.putText( + reg_vis, + "ICP Transformation Matrix:", + (10, height - 120), + cv2.FONT_HERSHEY_SIMPLEX, + 0.5, + (0, 0, 0), + 1, + ) + for i in range(4): matrix_text = f"[{icp_transform[i][0]:.2f}, {icp_transform[i][1]:.2f}, {icp_transform[i][2]:.2f}, {icp_transform[i][3]:.2f}]" - cv2.putText(reg_vis, matrix_text, (10, height - 90 + i * 20), - cv2.FONT_HERSHEY_SIMPLEX, 0.4, (0, 0, 0), 1) - + cv2.putText( + reg_vis, + matrix_text, + (10, height - 90 + i * 20), + cv2.FONT_HERSHEY_SIMPLEX, + 0.4, + (0, 0, 0), + 1, + ) + # Add legend - cv2.putText(reg_vis, "Red: Point Cloud", (width - 200, 30), - cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 255), 1) - cv2.putText(reg_vis, "Blue: Original Mesh", (width - 200, 60), - cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 0, 0), 1) - cv2.putText(reg_vis, "Green: Transformed Mesh", (width - 200, 90), - cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 1) - + cv2.putText( + reg_vis, + "Red: Point Cloud", + (width - 200, 30), + cv2.FONT_HERSHEY_SIMPLEX, + 0.5, + (0, 0, 255), + 1, + ) + cv2.putText( + reg_vis, + "Blue: Original Mesh", + (width - 200, 60), + 
cv2.FONT_HERSHEY_SIMPLEX, + 0.5, + (255, 0, 0), + 1, + ) + cv2.putText( + reg_vis, + "Green: Transformed Mesh", + (width - 200, 90), + cv2.FONT_HERSHEY_SIMPLEX, + 0.5, + (0, 255, 0), + 1, + ) + return reg_vis -def visualize_constraints(frame, refined_landmarks, constrained_landmarks, camera_intrinsics): + +def visualize_constraints( + frame, refined_landmarks, constrained_landmarks, camera_intrinsics +): """Visualize before and after applying anatomical constraints""" constraints_vis = frame.copy() height, width = frame.shape[:2] focal_x, focal_y, center_x, center_y = camera_intrinsics - + # Define finger indices thumb_indices = [1, 2, 3, 4] index_indices = [5, 6, 7, 8] highlighted_indices = thumb_indices + index_indices - + # Draw original refined landmarks for i, landmark in enumerate(refined_landmarks): x, y, z = landmark if z > 0: u = int(x * focal_x / z + center_x) v = int(y * focal_y / z + center_y) - + if 0 <= u < width and 0 <= v < height: if i in highlighted_indices: # Draw larger circles for thumb and index fingers (the constrained ones) - cv2.circle(constraints_vis, (u, v), 5, (0, 0, 255), -1) # Red circles + cv2.circle( + constraints_vis, (u, v), 5, (0, 0, 255), -1 + ) # Red circles else: cv2.circle(constraints_vis, (u, v), 3, (0, 0, 255), -1) - + # Draw connections for original landmarks for i in range(len(refined_landmarks) - 1): if i + 1 in thumb_indices and i in thumb_indices: # Draw thumb connections start = refined_landmarks[i] end = refined_landmarks[i + 1] - + if start[2] > 0 and end[2] > 0: start_u = int(start[0] * focal_x / start[2] + center_x) start_v = int(start[1] * focal_y / start[2] + center_y) end_u = int(end[0] * focal_x / end[2] + center_x) end_v = int(end[1] * focal_y / end[2] + center_y) - - if (0 <= start_u < width and 0 <= start_v < height and - 0 <= end_u < width and 0 <= end_v < height): - cv2.line(constraints_vis, (start_u, start_v), (end_u, end_v), (0, 0, 255), 2) - + + if ( + 0 <= start_u < width + and 0 <= start_v < height + and 0 <= end_u < width + and 0 <= end_v < height + ): + cv2.line( + constraints_vis, + (start_u, start_v), + (end_u, end_v), + (0, 0, 255), + 2, + ) + # Draw constrained landmarks for i, landmark in enumerate(constrained_landmarks): x, y, z = landmark if z > 0: u = int(x * focal_x / z + center_x) v = int(y * focal_y / z + center_y) - + if 0 <= u < width and 0 <= v < height: if i in highlighted_indices: # Draw larger circles for thumb and index fingers - cv2.circle(constraints_vis, (u, v), 5, (0, 255, 0), -1) # Green circles + cv2.circle( + constraints_vis, (u, v), 5, (0, 255, 0), -1 + ) # Green circles else: cv2.circle(constraints_vis, (u, v), 3, (0, 255, 0), -1) - + # Draw connections for constrained landmarks for i in range(len(constrained_landmarks) - 1): if i + 1 in thumb_indices and i in thumb_indices: # Draw thumb connections start = constrained_landmarks[i] end = constrained_landmarks[i + 1] - + if start[2] > 0 and end[2] > 0: start_u = int(start[0] * focal_x / start[2] + center_x) start_v = int(start[1] * focal_y / start[2] + center_y) end_u = int(end[0] * focal_x / end[2] + center_x) end_v = int(end[1] * focal_y / end[2] + center_y) - - if (0 <= start_u < width and 0 <= start_v < height and - 0 <= end_u < width and 0 <= end_v < height): - cv2.line(constraints_vis, (start_u, start_v), (end_u, end_v), (0, 255, 0), 2) - + + if ( + 0 <= start_u < width + and 0 <= start_v < height + and 0 <= end_u < width + and 0 <= end_v < height + ): + cv2.line( + constraints_vis, + (start_u, start_v), + (end_u, end_v), + (0, 255, 0), + 2, 
+ ) + # Add legend - cv2.putText(constraints_vis, "Red: Before constraints", (10, 30), - cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 0, 255), 2) - cv2.putText(constraints_vis, "Green: After constraints", (10, 60), - cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 255, 0), 2) - + cv2.putText( + constraints_vis, + "Red: Before constraints", + (10, 30), + cv2.FONT_HERSHEY_SIMPLEX, + 0.7, + (0, 0, 255), + 2, + ) + cv2.putText( + constraints_vis, + "Green: After constraints", + (10, 60), + cv2.FONT_HERSHEY_SIMPLEX, + 0.7, + (0, 255, 0), + 2, + ) + return constraints_vis -def visualize_robot_params(frame, position, direction, normal, gripper_width, camera_intrinsics): - """Visualize robot parameters (position, orientation, gripper width)""" + +def visualize_robot_params( + frame, position, orientation_matrix, gripper_width, camera_intrinsics +): + """Visualize robot parameters (position, orientation matrix, gripper width)""" robot_vis = frame.copy() height, width = frame.shape[:2] focal_x, focal_y, center_x, center_y = camera_intrinsics - + + # Extract axes from orientation matrix + x_axis = orientation_matrix[:, 0] # First column (principal axis) + y_axis = orientation_matrix[:, 1] # Second column + z_axis = orientation_matrix[:, 2] # Third column (normal) + # Project position to image coordinates x, y, z = position if z > 0: u = int(x * focal_x / z + center_x) v = int(y * focal_y / z + center_y) - + if 0 <= u < width and 0 <= v < height: # Draw end effector position cv2.circle(robot_vis, (u, v), 10, (255, 0, 0), -1) # Blue circle - - # Draw direction vector (X axis) - dx, dy, dz = direction + + # Draw X axis (principal axis) + dx, dy, dz = x_axis scale = 50 # Scale for better visualization end_u = int((x + dx * scale) * focal_x / (z + dz * scale) + center_x) end_v = int((y + dy * scale) * focal_y / (z + dz * scale) + center_y) - + if 0 <= end_u < width and 0 <= end_v < height: - cv2.line(robot_vis, (u, v), (end_u, end_v), (0, 0, 255), 2) # Red line (X axis) - cv2.putText(robot_vis, "X", (end_u, end_v), - cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 255), 2) - - # Draw normal vector (Z axis) - nx, ny, nz = normal + cv2.line( + robot_vis, (u, v), (end_u, end_v), (0, 0, 255), 2 + ) # Red line (X axis) + cv2.putText( + robot_vis, + "X", + (end_u, end_v), + cv2.FONT_HERSHEY_SIMPLEX, + 0.5, + (0, 0, 255), + 2, + ) + + # Draw Z axis (normal) + nx, ny, nz = z_axis end_u = int((x + nx * scale) * focal_x / (z + nz * scale) + center_x) end_v = int((y + ny * scale) * focal_y / (z + nz * scale) + center_y) - + if 0 <= end_u < width and 0 <= end_v < height: - cv2.line(robot_vis, (u, v), (end_u, end_v), (0, 255, 0), 2) # Green line (Z axis) - cv2.putText(robot_vis, "Z", (end_u, end_v), - cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2) - - # Calculate Y axis (cross product of Z and X for right-hand coordinate system) - y_axis = np.cross(normal, direction) - y_axis = y_axis / np.linalg.norm(y_axis) + cv2.line( + robot_vis, (u, v), (end_u, end_v), (0, 255, 0), 2 + ) # Green line (Z axis) + cv2.putText( + robot_vis, + "Z", + (end_u, end_v), + cv2.FONT_HERSHEY_SIMPLEX, + 0.5, + (0, 255, 0), + 2, + ) + + # Draw Y axis yx, yy, yz = y_axis - end_u = int((x + yx * scale) * focal_x / (z + yz * scale) + center_x) end_v = int((y + yy * scale) * focal_y / (z + yz * scale) + center_y) - + if 0 <= end_u < width and 0 <= end_v < height: - cv2.line(robot_vis, (u, v), (end_u, end_v), (255, 0, 0), 2) # Blue line (Y axis) - cv2.putText(robot_vis, "Y", (end_u, end_v), - cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 0, 0), 2) - + cv2.line( + robot_vis, (u, v), (end_u, 
end_v), (255, 0, 0), 2 + ) # Blue line (Y axis) + cv2.putText( + robot_vis, + "Y", + (end_u, end_v), + cv2.FONT_HERSHEY_SIMPLEX, + 0.5, + (255, 0, 0), + 2, + ) + # Visualize gripper width gripper_radius = int(gripper_width * 100) # Scale for better visualization - cv2.circle(robot_vis, (u, v), gripper_radius, (0, 255, 255), 2) # Yellow circle - + cv2.circle( + robot_vis, (u, v), gripper_radius, (0, 255, 255), 2 + ) # Yellow circle + # Add parameter values as text y_offset = height - 160 - cv2.putText(robot_vis, f"Position: ({x:.2f}, {y:.2f}, {z:.2f})", (10, y_offset), - cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 255), 1) - + cv2.putText( + robot_vis, + f"Position: ({x:.2f}, {y:.2f}, {z:.2f})", + (10, y_offset), + cv2.FONT_HERSHEY_SIMPLEX, + 0.5, + (255, 255, 255), + 1, + ) + y_offset += 30 - cv2.putText(robot_vis, f"Direction: ({dx:.2f}, {dy:.2f}, {dz:.2f})", (10, y_offset), - cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 255), 1) - + cv2.putText( + robot_vis, + f"X axis: ({x_axis[0]:.2f}, {x_axis[1]:.2f}, {x_axis[2]:.2f})", + (10, y_offset), + cv2.FONT_HERSHEY_SIMPLEX, + 0.5, + (0, 0, 255), + 1, + ) + y_offset += 30 - cv2.putText(robot_vis, f"Normal: ({nx:.2f}, {ny:.2f}, {nz:.2f})", (10, y_offset), - cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 1) - + cv2.putText( + robot_vis, + f"Z axis: ({z_axis[0]:.2f}, {z_axis[1]:.2f}, {z_axis[2]:.2f})", + (10, y_offset), + cv2.FONT_HERSHEY_SIMPLEX, + 0.5, + (0, 255, 0), + 1, + ) + y_offset += 30 - cv2.putText(robot_vis, f"Gripper Width: {gripper_width:.2f}", (10, y_offset), - cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 255), 1) - - return robot_vis \ No newline at end of file + cv2.putText( + robot_vis, + f"Gripper Width: {gripper_width:.2f}", + (10, y_offset), + cv2.FONT_HERSHEY_SIMPLEX, + 0.5, + (0, 255, 255), + 1, + ) + + return robot_vis
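
For reference, a minimal sketch of the pinhole back-projection that the new _create_cloud performs, assuming the same (fx, fy, cx, cy) intrinsics tuple and the same arbitrary 0.5 depth scale used in the diff; the function name backproject_mask is illustrative and does not exist in the repo.

import numpy as np

def backproject_mask(depth_map: np.ndarray, mask: np.ndarray, intrinsics: tuple, depth_scale: float = 0.5) -> np.ndarray:
    """Back-project masked pixels to 3D with the pinhole model:
    x = (u - cx) * z / fx, y = (v - cy) * z / fy, z = depth * depth_scale."""
    fx, fy, cx, cy = intrinsics
    v, u = np.where(mask > 0)        # row (v) and column (u) indices inside the mask
    z = depth_map[v, u]
    valid = z > 0                    # drop pixels with no depth estimate
    u, v, z = u[valid], v[valid], z[valid]
    z = z * depth_scale              # same arbitrary metric scaling as _create_cloud
    x = (u - cx) * z / fx
    y = (v - cy) * z / fy
    return np.column_stack((x, y, z))

# e.g. cloud = backproject_mask(depth_map, hand_mask, self.camera_intrinsics)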
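
Likewise, a condensed sketch of the orientation construction in the new _get_robot_params: an SVD plane fit over thumb + index points gives the z axis, a second SVD over the thumb points gives the x axis, and y closes the right-handed frame. The helper name orientation_from_fingers and the (4, 3)/(4, 3)-shaped inputs are assumptions for illustration only.

import numpy as np

def orientation_from_fingers(thumb_pts: np.ndarray, index_pts: np.ndarray) -> np.ndarray:
    """Rotation matrix whose columns are [x, y, z]: z = normal of the plane fit to
    thumb+index points, x = thumb principal axis re-orthogonalized against z, y = z x x."""
    finger_pts = np.vstack([thumb_pts, index_pts])
    _, _, vh = np.linalg.svd(finger_pts - finger_pts.mean(axis=0))
    z_axis = vh[2] / np.linalg.norm(vh[2])        # least-variance direction = plane normal

    _, _, vh_thumb = np.linalg.svd(thumb_pts - thumb_pts.mean(axis=0))
    x_axis = vh_thumb[0] - np.dot(vh_thumb[0], z_axis) * z_axis  # project out the normal
    x_axis = x_axis / np.linalg.norm(x_axis)      # assumes thumb axis is not parallel to the normal

    y_axis = np.cross(z_axis, x_axis)             # completes the right-handed frame
    return np.column_stack([x_axis, y_axis, z_axis])

# e.g. rotation_matrix = orientation_from_fingers(landmarks[1:5], landmarks[5:9])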
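
Finally, the 20th-percentile clamp on gripper width (the slippage mitigation added to _get_robot_params) could also be kept in a small stateful helper; this sketch swaps the manually trimmed list for a fixed-size collections.deque, which is an alternative design rather than what the diff implements, and the class name GripperWidthFilter is hypothetical.

from collections import deque

import numpy as np

class GripperWidthFilter:
    """Clamp gripper width to the 20th percentile of recent samples so brief
    slips do not close the gripper; a bounded deque replaces the hand-trimmed list."""

    def __init__(self, maxlen: int = 100, percentile: float = 20, min_samples: int = 5) -> None:
        self.buffer = deque(maxlen=maxlen)   # oldest samples fall off automatically
        self.percentile = percentile
        self.min_samples = min_samples

    def __call__(self, width: float) -> float:
        self.buffer.append(width)
        if len(self.buffer) > self.min_samples:
            floor = np.percentile(self.buffer, self.percentile)
            return max(width, floor)         # never report a width below the floor
        return width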