improve icp and robot params

Ethan Clark 2025-03-21 17:05:51 -07:00
parent aae4e91f54
commit b38527b5db
6 changed files with 690 additions and 417 deletions


@ -1,15 +1,17 @@
import os
import cv2
import time
import torch
import depth_pro
import mediapipe as mp
import numpy as np
import open3d as o3d
from PIL import Image
from robot_manager import RobotManager
from sam2.sam2_image_predictor import SAM2ImagePredictor
from tqdm import tqdm
from utils.visualizations import *
-# from diffusers import StableDiffusionInpaintPipeline
class HandProcessor:
@ -37,39 +39,38 @@ class HandProcessor:
        self.midas_transforms = torch.hub.load("intel-isl/MiDaS", "transforms")
        self.transform = self.midas_transforms.dpt_transform
        # Load SAM2 model for hand segmentation
        print("Loading SAM2 model...")
        self.sam2_predictor = SAM2ImagePredictor.from_pretrained(
            "facebook/sam2-hiera-large"
        )

-    # Segment hand from image using MediaPipe Hands landmarks
-    def _create_mask(self, image: np.ndarray, landmarks: list, thickness: int = 5, padding: int = 10) -> np.ndarray:
-        height, width = image.shape[:2]
-        mask = np.zeros((height, width), dtype=np.uint8)
-        points = [(int(landmark.x * width), int(landmark.y * height)) for landmark in landmarks.landmark]
-        # Draw filled circles at each landmark
-        for point in points:
-            cv2.circle(mask, point, thickness, 255, -1)
-        # Connect all landmarks with thick lines
-        for connection in self.mp_hands.HAND_CONNECTIONS:
-            start_idx, end_idx = connection
-            cv2.line(mask, points[start_idx], points[end_idx], 255, thickness)
-        # Create palm by connecting base of fingers with wrist
-        palm_points = [points[0], points[1], points[5], points[9], points[13], points[17]]
-        cv2.fillPoly(mask, [np.array(palm_points)], 255)
-        # Create shape between fingers
-        finger_bases = [(1, 5), (5, 9), (9, 13), (13, 17)]
-        for base1, base2 in finger_bases:
-            triangle = np.array([points[0], points[base1], points[base2]])
-            cv2.fillPoly(mask, [triangle], 255)
-        # Dilate to smooth and expand slightly
-        kernel = np.ones((padding, padding), np.uint8)
-        mask = cv2.dilate(mask, kernel, iterations=1)
-        # Smooth the mask
-        mask = cv2.GaussianBlur(mask, (21, 21), 0)
-        _, mask = cv2.threshold(mask, 50, 255, cv2.THRESH_BINARY)
-        return mask

    def _create_mask(self, frame: np.ndarray, landmarks: list) -> np.ndarray:
        height, width = frame.shape[:2]
        frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

        # Set image in SAM2 predictor
        with torch.inference_mode(), torch.autocast("cuda", dtype=torch.bfloat16):
            self.sam2_predictor.set_image(frame_rgb)  # Set image for prediction

            # Convert landmarks to point prompts
            points = []
            for landmark in landmarks.landmark:
                x, y = int(landmark.x * width), int(landmark.y * height)
                points.append([x, y])
            input_points = np.array(points)

            # Predict mask with point prompts
            masks, _, _ = self.sam2_predictor.predict(
                point_coords=input_points,  # Pass the points as prompts
                point_labels=np.ones(len(input_points)),  # All points from hand are foreground
                multimask_output=False,  # Just get one mask
            )

            mask = masks[0].astype(np.uint8) * 255

        return mask
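If the SAM2 mask proves unstable from frame to frame, one possible refinement (not part of this commit; a sketch assuming the same SAM2ImagePredictor API used above) is to request several mask hypotheses and keep the highest-scoring one:

import numpy as np

def predict_best_mask(predictor, input_points: np.ndarray) -> np.ndarray:
    # Sketch: ask SAM2 for multiple candidate masks and keep the best-scoring one.
    # Assumes predictor.set_image(...) was already called for the current frame.
    masks, scores, _ = predictor.predict(
        point_coords=input_points,
        point_labels=np.ones(len(input_points)),
        multimask_output=True,  # return several hypotheses with confidence scores
    )
    best = int(np.argmax(scores))
    return masks[best].astype(np.uint8) * 255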
@ -77,7 +78,8 @@ class HandProcessor:
    Transform input image to match MiDaS model requirements
    Estimate depth map using MiDaS model
    """

    # TODO: Look into why screen goes black sometimes
    # TODO: Swap MiDaS for ml-depth-pro model
    def _estimate_depth(self, image: np.ndarray) -> tuple:
        img = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
@ -116,7 +118,6 @@ class HandProcessor:
        v_coords = v_coords[valid_indices]
        z_values = z_values[valid_indices]

-        # NOTE: Arbitrary scaling factor for depth
        z_metric = z_values * 0.5

        # Back-project to 3D using camera intrinsics
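The new depth_pro import and the "Swap MiDaS for ml-depth-pro" TODO hint at where the arbitrary z_values * 0.5 scaling is headed; a rough sketch of metric depth with ml-depth-pro, based on that package's published usage (details here are assumptions, not code from this repo):

import depth_pro

# Sketch: ml-depth-pro predicts metric depth directly, which would replace
# the hand-tuned 0.5 scale applied to the MiDaS output above.
model, transform = depth_pro.create_model_and_transforms()
model.eval()

image, _, f_px = depth_pro.load_rgb("frame.jpg")  # hypothetical input path
prediction = model.infer(transform(image), f_px=f_px)

depth_m = prediction["depth"]            # depth map in meters
focal_px = prediction["focallength_px"]  # estimated focal length in pixels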
@ -127,32 +128,38 @@ class HandProcessor:
        return points
    # TODO: Look into better depth scaling
    def _create_mesh(self, landmarks: list, image_dims: tuple) -> np.ndarray:
        width, height = image_dims

        vertices = []
        for landmark in landmarks.landmark:
            vertices.append([landmark.x * width, landmark.y * height, landmark.z * width])

        # Define faces (triangles) connecting landmarks
        faces = [
            # Palm
            [0, 1, 5], [0, 5, 9], [0, 9, 13], [0, 13, 17],
            # Thumb
            [1, 2, 3], [2, 3, 4],
            # Index finger
            [5, 6, 7], [6, 7, 8],
            # Middle finger
            [9, 10, 11], [10, 11, 12],
            # Ring finger
            [13, 14, 15], [14, 15, 16],
            # Pinky
            [17, 18, 19], [18, 19, 20],
        ]

        dense_vertices = list(vertices)
@ -171,6 +178,7 @@ class HandProcessor:
        return np.array(dense_vertices), np.array(faces)

    # TODO: Improve ICP registration for better alignment
    def _icp_registration(self, cloud: np.ndarray, vertices: np.ndarray) -> np.ndarray:
        source = o3d.geometry.PointCloud()
        source.points = o3d.utility.Vector3dVector(vertices)
@ -178,28 +186,41 @@ class HandProcessor:
        target = o3d.geometry.PointCloud()
        target.points = o3d.utility.Vector3dVector(cloud)

        # Calculate centroids
        source_centroid = np.mean(vertices, axis=0)
        target_centroid = np.mean(cloud, axis=0)

        # Calculate initial translation to align centroids
        initial_translation = target_centroid - source_centroid

        # Create initial transformation matrix
        trans_init = np.eye(4)
        trans_init[:3, 3] = initial_translation

        result = o3d.pipelines.registration.registration_icp(
            source,
            target,
-            max_correspondence_distance=0.05,
            max_correspondence_distance=0.1,  # Increased distance
            init=trans_init,  # Initial transformation
            estimation_method=o3d.pipelines.registration.TransformationEstimationPointToPoint(),
            criteria=o3d.pipelines.registration.ICPConvergenceCriteria(
                max_iteration=100
            ),  # Increased iterations
        )

        transformation = result.transformation

        return transformation
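To check whether the larger correspondence distance and iteration cap actually help, one option (not in this commit) is to look at the fitness and inlier RMSE that Open3D reports, either on the result above or by re-evaluating a transform with evaluate_registration; a minimal sketch:

import numpy as np
import open3d as o3d

def report_icp_quality(source_pts: np.ndarray, target_pts: np.ndarray,
                       transform: np.ndarray, max_dist: float = 0.1) -> None:
    # Hypothetical helper: wraps Open3D's evaluate_registration to print alignment quality.
    source = o3d.geometry.PointCloud()
    source.points = o3d.utility.Vector3dVector(source_pts)
    target = o3d.geometry.PointCloud()
    target.points = o3d.utility.Vector3dVector(target_pts)

    result = o3d.pipelines.registration.evaluate_registration(source, target, max_dist, transform)
    # fitness: fraction of source points with a correspondence within max_dist
    # inlier_rmse: RMSE over those correspondences
    print(f"fitness={result.fitness:.3f}, inlier_rmse={result.inlier_rmse:.4f}")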
    # TODO: Look into better depth scaling
    def _refine_landmarks(self, landmarks: list, transform: int, image_dims: tuple):
        width, height = image_dims

        refined_landmarks = []
        for landmark in landmarks.landmark:
            # TODO: Figure out better scaling factor
            point = np.array([landmark.x * width, landmark.y * height, landmark.z * width, 1.0])

            # Apply transformation to 3D point
            transformed = np.dot(transform, point)
@ -244,42 +265,87 @@ class HandProcessor:
        return constrained
-    def _get_robot_params(self, refined_landmarks) -> tuple:
-        # Extract key landmarks
-        wrist = refined_landmarks[0]  # Wrist landmark
-        thumb_tip = refined_landmarks[4]  # Thumb tip
-        index_tip = refined_landmarks[8]  # Index finger tip
-        # Calculate end effector position (midpoint between thumb and index tips)
-        position = (thumb_tip + index_tip) / 2
-        # Calculate vectors for orientation
-        v1 = thumb_tip - wrist  # Vector from wrist to thumb tip
-        v2 = index_tip - wrist  # Vector from wrist to index tip
-        # Calculate normal to hand plane
-        normal = np.cross(v1, v2)
-        if np.linalg.norm(normal) > 0:
-            normal = normal / np.linalg.norm(normal)
-        else:
-            # Default if vectors are collinear
-            normal = np.array([0, 0, 1])
-        # Calculate main direction along thumb
-        direction = thumb_tip - wrist
-        if np.linalg.norm(direction) > 0:
-            direction = direction / np.linalg.norm(direction)
-        else:
-            # Default if thumb is at wrist (unlikely)
-            direction = np.array([1, 0, 0])
-        # Calculate gripper width
-        gripper_width = np.linalg.norm(thumb_tip - index_tip)
-        return position, direction, normal, gripper_width

    def _get_robot_params(self, refined_landmarks):
        # Extract keypoints
        landmarks = np.array(refined_landmarks)

        # Define indices for specific parts of the hand
        thumb_indices = [1, 2, 3, 4]  # Thumb landmarks
        index_indices = [5, 6, 7, 8]  # Index finger landmarks
        thumb_tip_idx = 4
        index_tip_idx = 8

        # 1. Set target position as midpoint between thumb tip and index tip
        thumb_tip = landmarks[thumb_tip_idx]
        index_tip = landmarks[index_tip_idx]
        position = (thumb_tip + index_tip) / 2

        # 2. Calculate orientation
        # Get all thumb and index finger points for plane fitting
        thumb_points = landmarks[thumb_indices]
        index_points = landmarks[index_indices]
        finger_points = np.vstack([thumb_points, index_points])

        # Fit plane to finger points
        centroid = np.mean(finger_points, axis=0)
        centered_points = finger_points - centroid

        # Use SVD to find the normal to the best-fitting plane
        u, s, vh = np.linalg.svd(centered_points)
        # The normal is the last right singular vector
        normal = vh[2, :]
        # Ensure normal is a unit vector
        normal = normal / np.linalg.norm(normal)

        # Fit a principal axis through thumb points
        thumb_centroid = np.mean(thumb_points, axis=0)
        thumb_centered = thumb_points - thumb_centroid

        # Use SVD again to find direction of maximum variance (principal axis)
        u, s, vh = np.linalg.svd(thumb_centered)
        principal_axis = vh[0, :]  # First singular vector
        # Ensure principal axis is a unit vector
        principal_axis = principal_axis / np.linalg.norm(principal_axis)

        # Construct orthogonal vectors for orientation matrix
        z_axis = normal  # Normal to the plane
        x_axis = principal_axis  # Along the thumb

        # Ensure x_axis is orthogonal to z_axis
        x_axis = x_axis - np.dot(x_axis, z_axis) * z_axis
        x_axis = x_axis / np.linalg.norm(x_axis)

        # Compute y_axis as cross product to ensure orthogonality
        y_axis = np.cross(z_axis, x_axis)

        # Create rotation matrix
        rotation_matrix = np.column_stack([x_axis, y_axis, z_axis])

        # 3. Calculate gripper width
        gripper_width = np.linalg.norm(thumb_tip - index_tip)

        # Apply percentile threshold to mitigate slippage (as mentioned in the paper)
        # Store gripper widths in a buffer to compute percentiles
        if not hasattr(self, "gripper_width_buffer"):
            self.gripper_width_buffer = []
        self.gripper_width_buffer.append(gripper_width)

        # Keep only the last 100 values
        if len(self.gripper_width_buffer) > 100:
            self.gripper_width_buffer.pop(0)

        # Apply 20th percentile threshold
        if len(self.gripper_width_buffer) > 5:  # Need enough samples
            min_width = np.percentile(self.gripper_width_buffer, 20)
            if gripper_width < min_width:
                gripper_width = min_width

        return position, rotation_matrix, gripper_width
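Downstream, PyBullet's IK (see the set_robot_pose TODO in robot_manager) expects an orientation quaternion rather than a 3x3 matrix; a minimal conversion sketch using scipy (an assumed dependency here, since its import was actually dropped from robot_manager in this commit):

import numpy as np
from scipy.spatial.transform import Rotation as R  # assumed dependency

def rotation_matrix_to_quaternion(rotation_matrix: np.ndarray) -> np.ndarray:
    # Convert the column-stacked [x_axis, y_axis, z_axis] matrix from
    # _get_robot_params into an (x, y, z, w) quaternion, the convention
    # PyBullet uses for target orientations.
    return R.from_matrix(rotation_matrix).as_quat()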
    def record_video(self) -> str:
        output_dir = os.path.join(os.getcwd(), "recordings")
        os.makedirs(output_dir, exist_ok=True)

        timestamp = time.strftime("%Y%m%d-%H%M%S")
@ -296,7 +362,7 @@ class HandProcessor:
        # Update camera intrinsics based on video dimensions
        self.camera_intrinsics = (width * 0.8, height * 0.8, width / 2, height / 2)

        fourcc = cv2.VideoWriter_fourcc(*"mp4v")
        out = None
        recording = False
@ -317,11 +383,19 @@ class HandProcessor:
            # Display recording indicator ONLY on the display frame
            if recording:
                cv2.circle(display_frame, (30, 30), 15, (0, 0, 255), -1)
                cv2.putText(display_frame, "RECORDING", (50, 40), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2, cv2.LINE_AA)

            # Show the display frame (with indicator if recording)
            cv2.imshow("Video Recording", display_frame)

            # Write the original frame (without indicator) to video file if recording
            if recording and out is not None:
@ -329,10 +403,7 @@ class HandProcessor:
            key = cv2.waitKey(1) & 0xFF

-            # NOTE: Haven't tested what happens if user records multiple videos in one session (probably overwrites?)
-            if key == ord('q'):
-                break
-            elif key == ord('r'):
            if key == ord("r"):
                recording = not recording
                if recording:
                    print(f"Started recording to {video_filename}")
@ -342,6 +413,7 @@ class HandProcessor:
                    out.release()
                    print(f"Stopped recording. Video saved to {video_filename}")
                    did_record = True
                    break
        if out is not None:
            out.release()
@ -373,12 +445,18 @@ class HandProcessor:
        constraints_output_path = f"{base_path}_constraints.mp4"
        robot_output_path = f"{base_path}_robot.mp4"

        fourcc = cv2.VideoWriter_fourcc(*"mp4v")
        segmented_out = cv2.VideoWriter(segmented_output_path, fourcc, fps, (width, height))
        depth_out = cv2.VideoWriter(depth_output_path, fourcc, fps, (width, height))
        mesh_out = cv2.VideoWriter(mesh_output_path, fourcc, fps, (width, height))
        reg_out = cv2.VideoWriter(registration_output_path, fourcc, fps, (640, 480))  # Fixed size
        constraints_out = cv2.VideoWriter(constraints_output_path, fourcc, fps, (width, height))
        robot_out = cv2.VideoWriter(robot_output_path, fourcc, fps, (width, height))
print(f"Processing video with {total_frames} frames...") print(f"Processing video with {total_frames} frames...")
@ -396,48 +474,60 @@ class HandProcessor:
            segmented_frame = frame.copy()
            depth_frame = np.zeros((height, width, 3), dtype=np.uint8)
            mesh_frame = frame.copy()
            reg_frame = np.ones((480, 640, 3), dtype=np.uint8) * 255  # Fixed size, white background
            constraints_frame = frame.copy()
            robot_frame = frame.copy()

            # Process if hand is detected
            if results.multi_hand_landmarks:
                for hand_landmarks in results.multi_hand_landmarks:
                    hand_mask = self._create_mask(frame, hand_landmarks)
                    mask_overlay = frame.copy()
                    mask_overlay[hand_mask > 0] = [0, 0, 255]  # Red color for mask
                    segmented_frame = cv2.addWeighted(frame, 0.7, mask_overlay, 0.3, 0)

                    depth_map, depth_vis = self._estimate_depth(frame)
                    depth_colored = cv2.applyColorMap(depth_vis, cv2.COLORMAP_JET)
                    depth_frame = depth_colored.copy()

                    cloud = self._create_cloud(depth_map, hand_mask)

                    hand_vertices, hand_faces = self._create_mesh(hand_landmarks, (width, height))
                    mesh_frame = visualize_mesh(frame, hand_vertices, hand_faces)

                    icp_transform = self._icp_registration(cloud, hand_vertices)
                    reg_frame = visualize_registration(cloud, hand_vertices, icp_transform)

                    refined_landmarks = self._refine_landmarks(hand_landmarks, icp_transform, (width, height))
                    original_refined = refined_landmarks.copy()

                    constrained_landmarks = self._apply_constraints(refined_landmarks)
                    constraints_frame = visualize_constraints(frame, original_refined, constrained_landmarks, self.camera_intrinsics)

                    position, orientation, gripper_width = self._get_robot_params(constrained_landmarks)
                    robot_frame = visualize_robot_params(frame, position, orientation, gripper_width, self.camera_intrinsics)

            segmented_out.write(segmented_frame)
            depth_out.write(depth_frame)
@ -450,12 +540,12 @@ class HandProcessor:
            display_size = (int(width * display_scale), int(height * display_scale))
            reg_display_size = (int(640 * display_scale), int(480 * display_scale))

            cv2.imshow("Segmented", cv2.resize(segmented_frame, display_size))
            cv2.imshow("Depth", cv2.resize(depth_frame, display_size))
            cv2.imshow("Mesh", cv2.resize(mesh_frame, display_size))
            cv2.imshow("Registration", cv2.resize(reg_frame, reg_display_size))
            cv2.imshow("Constraints", cv2.resize(constraints_frame, display_size))
            cv2.imshow("Robot Parameters", cv2.resize(robot_frame, display_size))
            if cv2.waitKey(1) & 0xFF == 27:  # ESC to exit
                break
@ -483,5 +573,5 @@ class HandProcessor:
"mesh": mesh_output_path, "mesh": mesh_output_path,
"registration": registration_output_path, "registration": registration_output_path,
"constraints": constraints_output_path, "constraints": constraints_output_path,
"robot": robot_output_path "robot": robot_output_path,
} }


@ -8,7 +8,7 @@ def main():
    record_option = input("Record a new video? (y/n): ")

    if record_option.lower() == "y":
        print("Starting video recording...")
        video_path = processor.record_video()

        if not video_path:


@ -1,11 +1,9 @@
import os
import cv2
import numpy as np
import pybullet as p
-from utils.math import *
from utils.handle_urdf import handle_urdf
-from scipy.spatial.transform import Rotation as R
class RobotManager:
@ -30,13 +28,13 @@ class RobotManager:
                robot_urdf,
                basePosition=[0, 0, 0],
                useFixedBase=True,
                flags=p.URDF_USE_SELF_COLLISION | p.URDF_USE_INERTIA_FROM_FILE,
            )
        except p.error as e:
            print(f"PyBullet error when loading URDF: {e}")
            raise e

        robot_name = p.getBodyInfo(robot_id)[1].decode("utf-8")
        print(f"Successfully loaded {robot_name} robot with ID: {robot_id}")

        return robot_id
@ -46,11 +44,11 @@ class RobotManager:
        assert self.joint_count > 0, "Robot has no joints"

        # Keywords to look for in joint names to identify end effector
        keywords = ["gripper", "tool", "tcp", "end_effector", "hand"]

        for i in range(self.joint_count):
            info = p.getJointInfo(self.robot_id, i)
            joint_name = info[1].decode("utf-8").lower()

            # Check if any keyword is in the joint name
            if any(keyword in joint_name for keyword in keywords):
@ -60,7 +58,12 @@ class RobotManager:
        return self.joint_count - 1

    # TODO: Use inverse kinematics to set the robot pose
    def set_robot_pose(self, position: np.ndarray, orientation_vectors: np.ndarray, gripper_width: float) -> None:
        pass
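The TODO above leaves set_robot_pose as a stub; a minimal sketch of the intended inverse-kinematics step with PyBullet (joint handling and argument names are assumptions, not this repo's final design):

import numpy as np
import pybullet as p

def set_robot_pose_sketch(robot_id: int, ee_link: int, position: np.ndarray,
                          orientation_quat: np.ndarray) -> None:
    # Solve IK for the end-effector target, then drive each movable joint toward the solution.
    joint_targets = p.calculateInverseKinematics(
        robot_id, ee_link, position.tolist(), targetOrientation=list(orientation_quat)
    )
    movable = [j for j in range(p.getNumJoints(robot_id))
               if p.getJointInfo(robot_id, j)[2] != p.JOINT_FIXED]
    for joint_index, target in zip(movable, joint_targets):
        p.setJointMotorControl2(robot_id, joint_index, p.POSITION_CONTROL, targetPosition=target)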
    # Render the robot in some scene using some camera parameters
@ -85,16 +88,13 @@ class RobotManager:
            yaw=cam_yaw,
            pitch=cam_pitch,
            roll=cam_roll,
            upAxisIndex=2,
        )

        # Compute projection matrix
        aspect = self.img_width / self.img_height
        proj_matrix = p.computeProjectionMatrixFOV(fov=60, aspect=aspect, nearVal=0.01, farVal=100.0)
        # Render the scene
@ -103,7 +103,7 @@ class RobotManager:
            height=self.img_height,
            viewMatrix=view_matrix,
            projectionMatrix=proj_matrix,
            renderer=p.ER_BULLET_HARDWARE_OPENGL,
        )
        # Extract RGB and depth
@ -131,17 +131,19 @@ class RobotManager:
        return rgb.astype(np.uint8)

    def __del__(self) -> None:
        if hasattr(self, "physics_client"):
            try:
                p.disconnect(self.physics_client)
            except:
                pass
if __name__ == "__main__":
    cwd = os.getcwd()
    urdf_path = os.path.join(cwd, "notebook/phantom/urdf/SO_5DOF_ARM100_05d.SLDASM/urdf/SO_5DOF_ARM100_05d.SLDASM.urdf")

    camera_intrinsics = (320, 240, 320, 240)  # Random intrinsics for example
    robot_vis = RobotManager(urdf_path, camera_intrinsics)


@ -0,0 +1 @@
# TODO: Implement camera calibration functions to extract camera extrinsics and intrinsics
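Since the new calibration module holds only this TODO so far, a standard OpenCV chessboard calibration sketch (board size and image paths are placeholders, not project decisions) could be a starting point:

import glob
import cv2
import numpy as np

def calibrate_from_chessboard(image_glob: str, board_size=(9, 6)) -> tuple:
    # Sketch: estimate the camera matrix and distortion coefficients from chessboard images.
    objp = np.zeros((board_size[0] * board_size[1], 3), np.float32)
    objp[:, :2] = np.mgrid[0 : board_size[0], 0 : board_size[1]].T.reshape(-1, 2)

    obj_points, img_points = [], []
    gray = None
    for path in glob.glob(image_glob):
        gray = cv2.cvtColor(cv2.imread(path), cv2.COLOR_BGR2GRAY)
        found, corners = cv2.findChessboardCorners(gray, board_size)
        if found:
            obj_points.append(objp)
            img_points.append(corners)

    # mtx/dist are the intrinsics; rvecs/tvecs are the per-view extrinsics.
    ret, mtx, dist, rvecs, tvecs = cv2.calibrateCamera(
        obj_points, img_points, gray.shape[::-1], None, None
    )
    return mtx, dist, rvecs, tvecs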


@ -2,7 +2,6 @@ import os
import xml.dom.minidom as minidom
import xml.etree.ElementTree as ET

"""
Fixes a specific path that uses the ROS-style 'package://' prefix.
The 'package://' prefix is used in URDF files to refer to files in the same package.
@ -10,18 +9,20 @@ However, when we're not running in a ROS environment, the paths are not valid.
This function tries to find the absolute path of the mesh file.
If the mesh file is not found, the original path is used.
"""


def fix_path(path: str, urdf_dir: str) -> str:
    if path.startswith("package://"):
        parts = path[len("package://") :].split("/", 1)
        if len(parts) == 2:
            package_name, rel_path = parts

            # Try potential locations for the mesh
            potential_paths = [
                os.path.join(urdf_dir, rel_path),
                os.path.join(urdf_dir, "../meshes", rel_path),
                os.path.join(urdf_dir, f"../{package_name}", rel_path),
                os.path.join(urdf_dir, "../..", rel_path),
            ]
            for possible_path in potential_paths:
@ -37,18 +38,20 @@ def fix_path(path: str, urdf_dir: str) -> str:
Iterates through the URDF file and fixes the paths of all mesh files.
The URDF file is parsed and the mesh paths are modified in-place.
"""


def fix_mesh_paths(urdf_path: str, urdf_dir: str) -> str:
    root = ET.parse(urdf_path).getroot()

    try:
        for mesh in root.findall(".//mesh"):
            if "filename" in mesh.attrib:
                mesh.attrib["filename"] = fix_path(mesh.attrib["filename"], urdf_dir)
    except Exception as e:
        print(f"Error fixing mesh paths: {e}")
        raise e

    fixed_path = urdf_path.replace(".urdf", "_fixed.urdf")

    return root, fixed_path
@ -56,15 +59,19 @@ def fix_mesh_paths(urdf_path:str, urdf_dir: str) -> str:
""" """
Formats the XML tree to be human-readable and writes it to a file. Formats the XML tree to be human-readable and writes it to a file.
""" """
def format_xml(root: ET.Element, fixed_path: str) -> None: def format_xml(root: ET.Element, fixed_path: str) -> None:
xml_str = ET.tostring(root, encoding='utf-8') xml_str = ET.tostring(root, encoding="utf-8")
dom = minidom.parseString(xml_str) dom = minidom.parseString(xml_str)
with open(fixed_path, 'w', encoding='utf-8') as f: with open(fixed_path, "w", encoding="utf-8") as f:
# Write with nice indentation but remove extra whitespace # Write with nice indentation but remove extra whitespace
pretty_xml = dom.toprettyxml(indent=' ') pretty_xml = dom.toprettyxml(indent=" ")
# Remove extra blank lines that minidom sometimes adds # Remove extra blank lines that minidom sometimes adds
pretty_xml = '\n'.join([line for line in pretty_xml.split('\n') if line.strip()]) pretty_xml = "\n".join(
[line for line in pretty_xml.split("\n") if line.strip()]
)
f.write(pretty_xml) f.write(pretty_xml)
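Putting the two helpers together, the presumable flow inside handle_urdf (its body sits outside this hunk, so this is an assumed sketch, not the file's actual code) is: fix the mesh paths, pretty-print the tree, and return the rewritten file:

import os

def handle_urdf_sketch(urdf_path: str) -> str:
    # Assumed flow: rewrite package:// mesh paths, then write a readable *_fixed.urdf.
    urdf_dir = os.path.dirname(urdf_path)
    root, fixed_path = fix_mesh_paths(urdf_path, urdf_dir)
    format_xml(root, fixed_path)
    return fixed_path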
@ -87,8 +94,11 @@ def handle_urdf(urdf_path: str) -> str:
        raise e


if __name__ == "__main__":
    # Example usage
    cwd = os.getcwd()
-    urdf_path = os.path.join(cwd, "notebook/phantom/urdf/SO_5DOF_ARM100_05d.SLDASM/urdf/SO_5DOF_ARM100_05d.SLDASM.urdf")
    urdf_path = os.path.join(cwd, "open_phantom/urdf/SO_5DOF_ARM100_05d.SLDASM/urdf/SO_5DOF_ARM100_05d.SLDASM.urdf")
    handle_urdf(urdf_path)


@ -11,13 +11,17 @@ def visualize_mesh(frame, hand_vertices, faces):
    for i in range(21):
        pt = tuple(map(int, hand_vertices[i][:2]))
        if 0 <= pt[0] < width and 0 <= pt[1] < height:
            cv2.circle(mesh_vis, pt, 4, (255, 0, 0), -1)  # Red circles for base vertices

    # Draw the interpolated vertices
    for i in range(21, len(hand_vertices)):
        pt = tuple(map(int, hand_vertices[i][:2]))
        if 0 <= pt[0] < width and 0 <= pt[1] < height:
            cv2.circle(mesh_vis, pt, 2, (0, 255, 255), -1)  # Yellow circles for interpolated vertices

    # Draw the faces of the mesh
    for face in faces:
@ -26,25 +30,53 @@ def visualize_mesh(frame, hand_vertices, faces):
        pt2 = tuple(map(int, hand_vertices[face[1]][:2]))
        pt3 = tuple(map(int, hand_vertices[face[2]][:2]))

        if (0 <= pt1[0] < width and 0 <= pt1[1] < height and
            0 <= pt2[0] < width and 0 <= pt2[1] < height and
            0 <= pt3[0] < width and 0 <= pt3[1] < height):
            cv2.line(mesh_vis, pt1, pt2, (0, 255, 0), 1)  # Green lines for mesh edges
            cv2.line(mesh_vis, pt2, pt3, (0, 255, 0), 1)
            cv2.line(mesh_vis, pt3, pt1, (0, 255, 0), 1)

    # Add explanation text
    cv2.putText(mesh_vis, "Red: Base vertices", (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 0.6, (255, 0, 0), 2)
    cv2.putText(mesh_vis, "Yellow: Interpolated vertices", (10, 60), cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0, 255, 255), 2)
    cv2.putText(mesh_vis, "Green: Mesh edges", (10, 90), cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0, 255, 0), 2)

    return mesh_vis
def visualize_registration(cloud, vertices, icp_transform):
    """Visualize the ICP registration process"""
    height, width = 480, 640  # Fixed size visualization
    reg_vis = np.ones((height, width, 3), dtype=np.uint8) * 255  # White background
@ -90,25 +122,63 @@ def visualize_registration(cloud, vertices, icp_transform):
            cv2.circle(reg_vis, pt, 2, (0, 255, 0), -1)  # Green dots

    # Add transformation matrix display
    cv2.putText(reg_vis, "ICP Transformation Matrix:", (10, height - 120), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 0), 1)

    for i in range(4):
        matrix_text = f"[{icp_transform[i][0]:.2f}, {icp_transform[i][1]:.2f}, {icp_transform[i][2]:.2f}, {icp_transform[i][3]:.2f}]"
        cv2.putText(reg_vis, matrix_text, (10, height - 90 + i * 20), cv2.FONT_HERSHEY_SIMPLEX, 0.4, (0, 0, 0), 1)

    # Add legend
    cv2.putText(reg_vis, "Red: Point Cloud", (width - 200, 30), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 255), 1)
    cv2.putText(reg_vis, "Blue: Original Mesh", (width - 200, 60), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 0, 0), 1)
    cv2.putText(reg_vis, "Green: Transformed Mesh", (width - 200, 90), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 1)

    return reg_vis
def visualize_constraints(frame, refined_landmarks, constrained_landmarks, camera_intrinsics):
    """Visualize before and after applying anatomical constraints"""
    constraints_vis = frame.copy()
    height, width = frame.shape[:2]
@ -129,7 +199,9 @@ def visualize_constraints(frame, refined_landmarks, constrained_landmarks, camer
        if 0 <= u < width and 0 <= v < height:
            if i in highlighted_indices:
                # Draw larger circles for thumb and index fingers (the constrained ones)
                cv2.circle(constraints_vis, (u, v), 5, (0, 0, 255), -1)  # Red circles
            else:
                cv2.circle(constraints_vis, (u, v), 3, (0, 0, 255), -1)
@ -146,9 +218,19 @@ def visualize_constraints(frame, refined_landmarks, constrained_landmarks, camer
        end_u = int(end[0] * focal_x / end[2] + center_x)
        end_v = int(end[1] * focal_y / end[2] + center_y)

        if (0 <= start_u < width and 0 <= start_v < height and
            0 <= end_u < width and 0 <= end_v < height):
            cv2.line(constraints_vis, (start_u, start_v), (end_u, end_v), (0, 0, 255), 2)
    # Draw constrained landmarks
    for i, landmark in enumerate(constrained_landmarks):
@ -160,7 +242,9 @@ def visualize_constraints(frame, refined_landmarks, constrained_landmarks, camer
        if 0 <= u < width and 0 <= v < height:
            if i in highlighted_indices:
                # Draw larger circles for thumb and index fingers
                cv2.circle(constraints_vis, (u, v), 5, (0, 255, 0), -1)  # Green circles
            else:
                cv2.circle(constraints_vis, (u, v), 3, (0, 255, 0), -1)
@ -177,24 +261,56 @@ def visualize_constraints(frame, refined_landmarks, constrained_landmarks, camer
        end_u = int(end[0] * focal_x / end[2] + center_x)
        end_v = int(end[1] * focal_y / end[2] + center_y)

        if (0 <= start_u < width and 0 <= start_v < height and
            0 <= end_u < width and 0 <= end_v < height):
            cv2.line(constraints_vis, (start_u, start_v), (end_u, end_v), (0, 255, 0), 2)

    # Add legend
    cv2.putText(constraints_vis, "Red: Before constraints", (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 0, 255), 2)
    cv2.putText(constraints_vis, "Green: After constraints", (10, 60), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 255, 0), 2)

    return constraints_vis
-def visualize_robot_params(frame, position, direction, normal, gripper_width, camera_intrinsics):
-    """Visualize robot parameters (position, orientation, gripper width)"""
def visualize_robot_params(frame, position, orientation_matrix, gripper_width, camera_intrinsics):
    """Visualize robot parameters (position, orientation matrix, gripper width)"""
    robot_vis = frame.copy()
    height, width = frame.shape[:2]
    focal_x, focal_y, center_x, center_y = camera_intrinsics

    # Extract axes from orientation matrix
    x_axis = orientation_matrix[:, 0]  # First column (principal axis)
    y_axis = orientation_matrix[:, 1]  # Second column
    z_axis = orientation_matrix[:, 2]  # Third column (normal)

    # Project position to image coordinates
    x, y, z = position
    if z > 0:
@ -205,59 +321,113 @@ def visualize_robot_params(frame, position, direction, normal, gripper_width, ca
        # Draw end effector position
        cv2.circle(robot_vis, (u, v), 10, (255, 0, 0), -1)  # Blue circle

        # Draw X axis (principal axis)
        dx, dy, dz = x_axis
        scale = 50  # Scale for better visualization
        end_u = int((x + dx * scale) * focal_x / (z + dz * scale) + center_x)
        end_v = int((y + dy * scale) * focal_y / (z + dz * scale) + center_y)
        if 0 <= end_u < width and 0 <= end_v < height:
            cv2.line(robot_vis, (u, v), (end_u, end_v), (0, 0, 255), 2)  # Red line (X axis)
            cv2.putText(robot_vis, "X", (end_u, end_v), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 255), 2)

        # Draw Z axis (normal)
        nx, ny, nz = z_axis
        end_u = int((x + nx * scale) * focal_x / (z + nz * scale) + center_x)
        end_v = int((y + ny * scale) * focal_y / (z + nz * scale) + center_y)
        if 0 <= end_u < width and 0 <= end_v < height:
            cv2.line(robot_vis, (u, v), (end_u, end_v), (0, 255, 0), 2)  # Green line (Z axis)
            cv2.putText(robot_vis, "Z", (end_u, end_v), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2)

-        # Calculate Y axis (cross product of Z and X for right-hand coordinate system)
-        y_axis = np.cross(normal, direction)
-        y_axis = y_axis / np.linalg.norm(y_axis)
        # Draw Y axis
        yx, yy, yz = y_axis
        end_u = int((x + yx * scale) * focal_x / (z + yz * scale) + center_x)
        end_v = int((y + yy * scale) * focal_y / (z + yz * scale) + center_y)
        if 0 <= end_u < width and 0 <= end_v < height:
            cv2.line(robot_vis, (u, v), (end_u, end_v), (255, 0, 0), 2)  # Blue line (Y axis)
            cv2.putText(robot_vis, "Y", (end_u, end_v), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 0, 0), 2)

        # Visualize gripper width
        gripper_radius = int(gripper_width * 100)  # Scale for better visualization
        cv2.circle(robot_vis, (u, v), gripper_radius, (0, 255, 255), 2)  # Yellow circle

        # Add parameter values as text
        y_offset = height - 160
        cv2.putText(robot_vis, f"Position: ({x:.2f}, {y:.2f}, {z:.2f})", (10, y_offset), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 255), 1)
        y_offset += 30
-        cv2.putText(robot_vis, f"Direction: ({dx:.2f}, {dy:.2f}, {dz:.2f})", (10, y_offset), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 255), 1)
        cv2.putText(robot_vis, f"X axis: ({x_axis[0]:.2f}, {x_axis[1]:.2f}, {x_axis[2]:.2f})", (10, y_offset), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 255), 1)
        y_offset += 30
-        cv2.putText(robot_vis, f"Normal: ({nx:.2f}, {ny:.2f}, {nz:.2f})", (10, y_offset), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 1)
        cv2.putText(robot_vis, f"Z axis: ({z_axis[0]:.2f}, {z_axis[1]:.2f}, {z_axis[2]:.2f})", (10, y_offset), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 1)
        y_offset += 30
        cv2.putText(robot_vis, f"Gripper Width: {gripper_width:.2f}", (10, y_offset), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 255), 1)

    return robot_vis
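All of these overlays project 3D points with the same pinhole model (u = x*fx/z + cx, v = y*fy/z + cy); a small hypothetical helper could replace the repeated inline math:

import numpy as np

def project_point(point, camera_intrinsics):
    # Hypothetical helper: pinhole projection of a 3D point into pixel coordinates.
    focal_x, focal_y, center_x, center_y = camera_intrinsics
    x, y, z = point
    if z <= 0:
        return None  # behind the camera; caller should skip drawing
    return int(x * focal_x / z + center_x), int(y * focal_y / z + center_y)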