open_phantom/phantom/hand_processor.py

import os
import cv2
import time
import torch
import numpy as np
import open3d as o3d
import mediapipe as mp
from tqdm import tqdm
from utils.visualizations import *
from phantom.robot_manager import RobotManager
# from diffusers import StableDiffusionInpaintPipeline

class HandProcessor:
    def __init__(self) -> None:
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        self.mp_hands = mp.solutions.hands
        self.mp_drawing = mp.solutions.drawing_utils
        self.mp_drawing_styles = mp.solutions.drawing_styles
        self.hands = self.mp_hands.Hands(
            model_complexity=1,
            max_num_hands=1,
            static_image_mode=False,
        )

        # NOTE: Look into better depth estimation models
        # Initialize MiDaS for depth estimation
        print("Loading MiDaS model...")
        self.midas = torch.hub.load("intel-isl/MiDaS", "DPT_Hybrid")
        self.midas.to(self.device)
        self.midas.eval()

        # Load MiDaS transforms to resize and normalize input images
        self.midas_transforms = torch.hub.load("intel-isl/MiDaS", "transforms")
        self.transform = self.midas_transforms.dpt_transform
    # Segment hand from image using MediaPipe Hands landmarks
    def _create_mask(self, image: np.ndarray, landmarks: list, thickness: int = 5, padding: int = 10) -> np.ndarray:
        height, width = image.shape[:2]
        mask = np.zeros((height, width), dtype=np.uint8)
        points = [(int(landmark.x * width), int(landmark.y * height)) for landmark in landmarks.landmark]

        # Draw filled circles at each landmark
        for point in points:
            cv2.circle(mask, point, thickness, 255, -1)

        # Connect all landmarks with thick lines
        for connection in self.mp_hands.HAND_CONNECTIONS:
            start_idx, end_idx = connection
            cv2.line(mask, points[start_idx], points[end_idx], 255, thickness)

        # Create the palm by connecting the finger bases with the wrist
        palm_points = [points[0], points[1], points[5], points[9], points[13], points[17]]
        cv2.fillPoly(mask, [np.array(palm_points)], 255)

        # Fill the webbing between adjacent fingers
        finger_bases = [(1, 5), (5, 9), (9, 13), (13, 17)]
        for base1, base2 in finger_bases:
            triangle = np.array([points[0], points[base1], points[base2]])
            cv2.fillPoly(mask, [triangle], 255)

        # Dilate to smooth and expand slightly
        kernel = np.ones((padding, padding), np.uint8)
        mask = cv2.dilate(mask, kernel, iterations=1)

        # Smooth the mask edges, then re-binarize
        mask = cv2.GaussianBlur(mask, (21, 21), 0)
        _, mask = cv2.threshold(mask, 50, 255, cv2.THRESH_BINARY)

        return mask
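
    # Usage sketch (illustrative, assuming `frame` is a BGR image and
    # `results` came from self.hands.process(...)):
    #   mask = self._create_mask(frame, results.multi_hand_landmarks[0])
    #   overlay = frame.copy()
    #   overlay[mask > 0] = (0, 0, 255)  # paint masked hand pixels red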
"""
Transform input image to match MiDaS model requirements
Estimate depth map using MiDaS model
"""
# TODO: Look into why screen goes black sometimes
def _estimate_depth(self, image: np.ndarray) -> tuple:
img = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
# Transform img for MiDaS model
input_batch = self.transform(img).to(self.device)
with torch.inference_mode():
prediction = self.midas(input_batch)
# Interpolate to original size
prediction = torch.nn.functional.interpolate(
prediction.unsqueeze(1),
size=image.shape[:2],
mode="bicubic",
align_corners=False,
).squeeze()
# Convert to numpy and normalize to 0-255 for visualization
depth_map = prediction.cpu().numpy()
depth_min = depth_map.min()
depth_max = depth_map.max()
depth_map_normalized = 255 * (depth_map - depth_min) / (depth_max - depth_min)
return depth_map, depth_map_normalized.astype(np.uint8)
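
    # NOTE (sketch): MiDaS predicts *relative* (inverse) depth rather than
    # metric depth, which is why _create_cloud below applies an arbitrary
    # scale factor; treat absolute z values with caution. For a quick look:
    #   depth_map, depth_vis = self._estimate_depth(frame)
    #   cv2.imshow("depth", cv2.applyColorMap(depth_vis, cv2.COLORMAP_JET))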
    # TODO: Look into better depth scaling
    def _create_cloud(self, depth_map: np.ndarray, hand_mask: np.ndarray) -> np.ndarray:
        focal_x, focal_y, center_x, center_y = self.camera_intrinsics

        v_coords, u_coords = np.where(hand_mask > 0)
        z_values = depth_map[v_coords, u_coords]

        # Filter out zero depth values
        valid_indices = z_values > 0
        u_coords = u_coords[valid_indices]
        v_coords = v_coords[valid_indices]
        z_values = z_values[valid_indices]

        # NOTE: Arbitrary scaling factor for depth
        z_metric = z_values * 0.5

        # Back-project to 3D using the pinhole camera model
        x_values = (u_coords - center_x) * z_metric / focal_x
        y_values = (v_coords - center_y) * z_metric / focal_y

        points = np.column_stack((x_values, y_values, z_metric))
        return points
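
    # Worked example of the back-projection above (hypothetical numbers):
    # with focal_x = focal_y = 512 and center = (320, 240), a masked pixel at
    # (u, v) = (420, 340) with z_metric = 1.0 back-projects to
    #   x = (420 - 320) * 1.0 / 512 ≈ 0.195
    #   y = (340 - 240) * 1.0 / 512 ≈ 0.195
    # i.e. right of and below the optical axis (image y grows downward).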
    # TODO: Look into better Z scaling
    def _create_mesh(self, landmarks: list, image_dims: tuple) -> tuple:
        width, height = image_dims
        vertices = []
        for landmark in landmarks.landmark:
            vertices.append([
                landmark.x * width,
                landmark.y * height,
                landmark.z * width,  # MediaPipe z is roughly on the same scale as x
            ])

        # Define faces (triangles) connecting landmarks
        faces = [
            # Palm
            [0, 1, 5], [0, 5, 9], [0, 9, 13], [0, 13, 17],
            # Thumb
            [1, 2, 3], [2, 3, 4],
            # Index finger
            [5, 6, 7], [6, 7, 8],
            # Middle finger
            [9, 10, 11], [10, 11, 12],
            # Ring finger
            [13, 14, 15], [14, 15, 16],
            # Pinky
            [17, 18, 19], [18, 19, 20],
        ]

        dense_vertices = list(vertices)

        # Add interpolated vertices along finger segments
        connections = self.mp_hands.HAND_CONNECTIONS
        for connection in connections:
            start_idx, end_idx = connection
            start_point = np.array(vertices[start_idx])
            end_point = np.array(vertices[end_idx])
            # Add 2 interpolated points between each connected pair
            for t in [0.33, 0.66]:
                interp_point = start_point * (1 - t) + end_point * t
                dense_vertices.append(interp_point.tolist())

        return np.array(dense_vertices), np.array(faces)
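
    # Sanity check on the densification above: MediaPipe yields 21 landmarks
    # and HAND_CONNECTIONS has 21 edges, so with two interpolated points per
    # edge the dense mesh holds 21 + 2 * 21 = 63 vertices.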
    def _icp_registration(self, cloud: np.ndarray, vertices: np.ndarray) -> np.ndarray:
        # Normalize both point clouds to handle scale differences
        cloud_centroid = np.mean(cloud, axis=0)
        vertices_centroid = np.mean(vertices, axis=0)
        cloud_centered = cloud - cloud_centroid
        vertices_centered = vertices - vertices_centroid
        cloud_scale = np.max(np.linalg.norm(cloud_centered, axis=1))
        vertices_scale = np.max(np.linalg.norm(vertices_centered, axis=1))
        cloud_normalized = cloud_centered / cloud_scale
        vertices_normalized = vertices_centered / vertices_scale

        # Create Open3D point clouds
        source = o3d.geometry.PointCloud()
        source.points = o3d.utility.Vector3dVector(vertices_normalized)
        target = o3d.geometry.PointCloud()
        target.points = o3d.utility.Vector3dVector(cloud_normalized)

        # Optional: downsample for better performance
        source = source.voxel_down_sample(voxel_size=0.01)
        target = target.voxel_down_sample(voxel_size=0.01)

        # Point-to-plane ICP requires normals on the target cloud
        target.estimate_normals(
            search_param=o3d.geometry.KDTreeSearchParamHybrid(radius=0.1, max_nn=30)
        )

        # Initial identity transformation
        trans_init = np.eye(4)

        # Run ICP with a generous correspondence distance
        result = o3d.pipelines.registration.registration_icp(
            source,
            target,
            max_correspondence_distance=0.2,  # Increased for better matching
            init=trans_init,
            estimation_method=o3d.pipelines.registration.TransformationEstimationPointToPlane(),
            criteria=o3d.pipelines.registration.ICPConvergenceCriteria(max_iteration=100),
        )

        # Create denormalization transformation
        denorm_transform = np.eye(4)
        denorm_transform[:3, :3] *= cloud_scale / vertices_scale

        # Compose: move vertices to the origin, apply the ICP result,
        # rescale, then translate to the cloud centroid
        transform_to_origin = np.eye(4)
        transform_to_origin[:3, 3] = -vertices_centroid
        transform_from_origin = np.eye(4)
        transform_from_origin[:3, 3] = cloud_centroid

        final_transform = np.matmul(
            transform_from_origin,
            np.matmul(denorm_transform, np.matmul(result.transformation, transform_to_origin)),
        )
        return final_transform
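
    # Applying the resulting 4x4 transform to a point p = [x, y, z] uses
    # homogeneous coordinates, e.g.
    #   p_refined = (final_transform @ np.append(p, 1.0))[:3]
    # which is what _refine_landmarks does for every landmark.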
    def _refine_landmarks(self, landmarks: list, transform: np.ndarray, image_dims: tuple) -> list:
        width, height = image_dims
        refined_landmarks = []
        for landmark in landmarks.landmark:
            point = np.array([
                landmark.x * width,
                landmark.y * height,
                landmark.z * width,  # TODO: Figure out better scaling factor
                1.0,
            ])
            # Apply the homogeneous transformation to the 3D point
            transformed = np.dot(transform, point)
            refined_landmarks.append(transformed[:3])
        return refined_landmarks
    def _apply_constraints(self, landmarks: list) -> np.ndarray:
        constrained = np.array(landmarks)

        # Define finger joint indices
        # MediaPipe hand model: wrist is 0, thumb is 1-4, index is 5-8, etc.
        thumb_indices = [1, 2, 3, 4]
        index_indices = [5, 6, 7, 8]

        # Constrain the last two joints of the thumb and index finger,
        # as mentioned in the paper
        for finger_indices in [thumb_indices, index_indices]:
            # Get the last three joints (two segments)
            if len(finger_indices) >= 3:
                joint1 = constrained[finger_indices[-3]]
                joint2 = constrained[finger_indices[-2]]
                joint3 = constrained[finger_indices[-1]]

                # Direction of the first segment
                dir1 = joint2 - joint1
                dir1 = dir1 / np.linalg.norm(dir1)

                # Rather than modeling a full ball joint, snap the last
                # segment to the previous segment's direction while keeping
                # its length (approximately a single degree of freedom)
                actual_dir = joint3 - joint2
                actual_length = np.linalg.norm(actual_dir)
                corrected_dir = dir1 * actual_length

                # Apply the correction
                constrained[finger_indices[-1]] = joint2 + corrected_dir

        return constrained
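
    # Worked example of the constraint (hypothetical numbers): if the previous
    # segment points along x, dir1 = [1, 0, 0], and the fingertip offset is
    # joint3 - joint2 = [0.02, 0.02, 0], its length ~0.0283 is kept but its
    # direction is snapped, so the tip moves to joint2 + [0.0283, 0, 0].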
    def _get_robot_params(self, refined_landmarks) -> tuple:
        # Extract key landmarks
        wrist = refined_landmarks[0]
        thumb_tip = refined_landmarks[4]
        index_tip = refined_landmarks[8]

        # End effector position: midpoint between thumb and index tips
        position = (thumb_tip + index_tip) / 2

        # Vectors used for orientation
        v1 = thumb_tip - wrist  # Wrist to thumb tip
        v2 = index_tip - wrist  # Wrist to index tip

        # Normal to the hand plane
        normal = np.cross(v1, v2)
        if np.linalg.norm(normal) > 0:
            normal = normal / np.linalg.norm(normal)
        else:
            # Fallback if the vectors are collinear
            normal = np.array([0, 0, 1])

        # Main direction along the thumb
        direction = thumb_tip - wrist
        if np.linalg.norm(direction) > 0:
            direction = direction / np.linalg.norm(direction)
        else:
            # Fallback if the thumb tip coincides with the wrist (unlikely)
            direction = np.array([1, 0, 0])

        # Gripper width: distance between thumb and index fingertips
        gripper_width = np.linalg.norm(thumb_tip - index_tip)

        return position, direction, normal, gripper_width
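
    # Usage sketch (hypothetical values): a pinch whose thumb and index tips
    # are 0.04 apart yields gripper_width == 0.04 with position at their
    # midpoint; mapping these values onto actual robot commands is presumably
    # the job of the imported RobotManager.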
    def record_video(self) -> str | None:
        output_dir = os.path.join(os.getcwd(), "recordings")
        os.makedirs(output_dir, exist_ok=True)
        timestamp = time.strftime("%Y%m%d-%H%M%S")
        video_filename = os.path.join(output_dir, f"recorded_video_{timestamp}.mp4")

        cap = cv2.VideoCapture(0)
        assert cap.isOpened(), "Failed to open camera."

        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
        fps = int(cap.get(cv2.CAP_PROP_FPS))
        # Some webcams report 0 FPS; fall back to a sane default
        fps = fps if fps > 0 else 30

        # Update camera intrinsics based on video dimensions
        self.camera_intrinsics = (width * 0.8, height * 0.8, width / 2, height / 2)

        fourcc = cv2.VideoWriter_fourcc(*"mp4v")
        out = None
        recording = False
        did_record = False

        print("Camera is active. Press 'r' to start/stop recording, 'q' to quit.")

        while cap.isOpened():
            success, frame = cap.read()
            assert success, "Failed to read from camera."

            # Mirror the image for more intuitive viewing
            frame = cv2.flip(frame, 1)

            # Separate display frame so the recording indicator is not saved
            display_frame = frame.copy()
            if recording:
                cv2.circle(display_frame, (30, 30), 15, (0, 0, 255), -1)
                cv2.putText(display_frame, "RECORDING", (50, 40), cv2.FONT_HERSHEY_SIMPLEX,
                            1, (0, 0, 255), 2, cv2.LINE_AA)

            cv2.imshow("Video Recording", display_frame)

            # Write the original frame (without indicator) while recording
            if recording and out is not None:
                out.write(frame)

            key = cv2.waitKey(1) & 0xFF
            # NOTE: The filename is fixed once per session, so starting a
            # second recording in the same session overwrites the first clip
            if key == ord("q"):
                break
            elif key == ord("r"):
                recording = not recording
                if recording:
                    print(f"Started recording to {video_filename}")
                    out = cv2.VideoWriter(video_filename, fourcc, fps, (width, height))
                else:
                    if out is not None:
                        out.release()
                        print(f"Stopped recording. Video saved to {video_filename}")
                        did_record = True

        if out is not None:
            out.release()
        cap.release()
        cv2.destroyAllWindows()

        return video_filename if did_record else None
    def process_video(self, video_path: str) -> dict:
        assert video_path, "Video path is required."

        cap = cv2.VideoCapture(video_path)
        assert cap.isOpened(), f"Failed to open video file {video_path}"

        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
        fps = int(cap.get(cv2.CAP_PROP_FPS))
        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

        # Update camera intrinsics based on video dimensions
        self.camera_intrinsics = (width * 0.8, height * 0.8, width / 2, height / 2)

        base_path = os.path.splitext(video_path)[0]
        segmented_output_path = f"{base_path}_masked.mp4"
        depth_output_path = f"{base_path}_depth.mp4"
        mesh_output_path = f"{base_path}_mesh.mp4"
        registration_output_path = f"{base_path}_registration.mp4"
        constraints_output_path = f"{base_path}_constraints.mp4"
        robot_output_path = f"{base_path}_robot.mp4"

        fourcc = cv2.VideoWriter_fourcc(*"mp4v")
        segmented_out = cv2.VideoWriter(segmented_output_path, fourcc, fps, (width, height))
        depth_out = cv2.VideoWriter(depth_output_path, fourcc, fps, (width, height))
        mesh_out = cv2.VideoWriter(mesh_output_path, fourcc, fps, (width, height))
        reg_out = cv2.VideoWriter(registration_output_path, fourcc, fps, (640, 480))  # Fixed size
        constraints_out = cv2.VideoWriter(constraints_output_path, fourcc, fps, (width, height))
        robot_out = cv2.VideoWriter(robot_output_path, fourcc, fps, (width, height))

        print(f"Processing video with {total_frames} frames...")
        for _ in tqdm(range(total_frames)):
            success, frame = cap.read()
            if not success:
                # CAP_PROP_FRAME_COUNT can overestimate; stop at the real end
                break

            # Convert image to RGB for MediaPipe
            image_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

            # Process with MediaPipe Hands
            results = self.hands.process(image_rgb)

            # Initialize output frames
            segmented_frame = frame.copy()
            depth_frame = np.zeros((height, width, 3), dtype=np.uint8)
            mesh_frame = frame.copy()
            reg_frame = np.ones((480, 640, 3), dtype=np.uint8) * 255  # Fixed size, white background
            constraints_frame = frame.copy()
            robot_frame = frame.copy()

            # Process if a hand is detected
            if results.multi_hand_landmarks:
                for hand_landmarks in results.multi_hand_landmarks:
                    # Segment hand
                    hand_mask = self._create_mask(frame, hand_landmarks)
                    mask_overlay = frame.copy()
                    mask_overlay[hand_mask > 0] = [0, 0, 255]  # Red color for mask
                    segmented_frame = cv2.addWeighted(frame, 0.7, mask_overlay, 0.3, 0)

                    # Depth estimation
                    depth_map, depth_vis = self._estimate_depth(frame)
                    depth_colored = cv2.applyColorMap(depth_vis, cv2.COLORMAP_JET)
                    depth_frame = depth_colored.copy()

                    # Create hand mesh
                    hand_vertices, hand_faces = self._create_mesh(hand_landmarks, (width, height))
                    mesh_frame = visualize_mesh(frame, hand_vertices, hand_faces)

                    # Create point cloud from depth
                    cloud = self._create_cloud(depth_map, hand_mask)

                    # Perform ICP registration
                    icp_transform = self._icp_registration(cloud, hand_vertices)
                    reg_frame = visualize_registration(cloud, hand_vertices, icp_transform)

                    # Refine landmarks using the ICP transformation
                    refined_landmarks = self._refine_landmarks(hand_landmarks, icp_transform, (width, height))

                    # Keep pre-constraint landmarks for visualization
                    original_refined = refined_landmarks.copy()

                    # Apply anatomical constraints
                    constrained_landmarks = self._apply_constraints(refined_landmarks)
                    constraints_frame = visualize_constraints(frame, original_refined, constrained_landmarks, self.camera_intrinsics)

                    # # Calculate robot parameters
                    # position, direction, normal, gripper_width = self._get_robot_params(constrained_landmarks)
                    # robot_frame = visualize_robot_params(frame, position, direction, normal, gripper_width)
            segmented_out.write(segmented_frame)
            depth_out.write(depth_frame)
            mesh_out.write(mesh_frame)
            reg_out.write(reg_frame)
            constraints_out.write(constraints_frame)
            robot_out.write(robot_frame)

            display_scale = 0.5
            display_size = (int(width * display_scale), int(height * display_scale))
            reg_display_size = (int(640 * display_scale), int(480 * display_scale))

            cv2.imshow("Segmented", cv2.resize(segmented_frame, display_size))
            cv2.imshow("Depth", cv2.resize(depth_frame, display_size))
            cv2.imshow("Mesh", cv2.resize(mesh_frame, display_size))
            cv2.imshow("Registration", cv2.resize(reg_frame, reg_display_size))
            cv2.imshow("Constraints", cv2.resize(constraints_frame, display_size))
            cv2.imshow("Robot Parameters", cv2.resize(robot_frame, display_size))

            if cv2.waitKey(1) & 0xFF == 27:  # ESC to exit
                break

        cap.release()
        segmented_out.release()
        depth_out.release()
        mesh_out.release()
        reg_out.release()
        constraints_out.release()
        robot_out.release()
        cv2.destroyAllWindows()
print(f"Processing complete. Results saved to:")
print(f"- Hand mask: {segmented_output_path}")
print(f"- Depth visualization: {depth_output_path}")
print(f"- Mesh visualization: {mesh_output_path}")
print(f"- Registration visualization: {registration_output_path}")
print(f"- Constraints visualization: {constraints_output_path}")
print(f"- Robot parameters: {robot_output_path}")
return {
"segmented": segmented_output_path,
"depth": depth_output_path,
"mesh": mesh_output_path,
"registration": registration_output_path,
"constraints": constraints_output_path,
"robot": robot_output_path
}
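

# Minimal usage sketch, assuming a webcam at device index 0 and that the
# utils.visualizations helpers are importable; the flow is illustrative only.
if __name__ == "__main__":
    processor = HandProcessor()
    recorded_path = processor.record_video()
    if recorded_path is not None:
        outputs = processor.process_video(recorded_path)
        print(outputs)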