diff --git a/open_phantom/hand_processor.py b/open_phantom/process_hand.py
similarity index 77%
rename from open_phantom/hand_processor.py
rename to open_phantom/process_hand.py
index b74b51e..9aa577b 100644
--- a/open_phantom/hand_processor.py
+++ b/open_phantom/process_hand.py
@@ -1,23 +1,30 @@
 import os
 import time
-
 import cv2
-import depth_pro
-import mediapipe as mp
+import torch
 import numpy as np
 import open3d as o3d
-import torch
-from PIL import Image
-from robot_manager import RobotManager
-from sam2.sam2_image_predictor import SAM2ImagePredictor
+import mediapipe as mp
+
 from tqdm import tqdm
+from collections import deque
 from utils.visualizations import *
+from sam2.sam2_image_predictor import SAM2ImagePredictor
 
 
-class HandProcessor:
+# TODO: Optimize these constants for better results
+HAND_WIDTH_MM = 90.0  # Average width of male hand in mm
+CLOUD_Z_SCALE = 5.0
+# Maximum expected distance between human thumb and index finger in mm when fully extended.
+MAXIMUM_HAND_WIDTH_MM = 100.0
+
+
+class ProcessHand:
     def __init__(self) -> None:
         self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 
+        # Initialize MediaPipe Hands for hand detection
+        print("Loading MediaPipe Hands model...")
         self.mp_hands = mp.solutions.hands
         self.mp_drawing = mp.solutions.drawing_utils
         self.mp_drawing_styles = mp.solutions.drawing_styles
@@ -28,10 +35,9 @@ class HandProcessor:
             static_image_mode=False,
         )
 
-        # NOTE: Look into better depth estimation models
         # Initialize MiDaS for depth estimation
         print("Loading MiDaS model...")
-        self.midas = torch.hub.load("intel-isl/MiDaS", "DPT_Hybrid")
+        self.midas = torch.hub.load("intel-isl/MiDaS", "DPT_Large")
        self.midas.to(self.device)
         self.midas.eval()
@@ -45,11 +51,17 @@ class HandProcessor:
             "facebook/sam2-hiera-large"
         )
 
+        self.gripper_width_buffer = deque(maxlen=100)
+
+    """
+    Create a segmentation mask over the hand using SAM2 model
+    """
+
     def _create_mask(self, frame: np.ndarray, landmarks: list) -> np.ndarray:
         height, width = frame.shape[:2]
 
+        # Convert image to RGB for SAM2 model
         frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
 
-        # Set image in SAM2 predictor
         with torch.inference_mode(), torch.autocast("cuda", dtype=torch.bfloat16):
             self.sam2_predictor.set_image(frame_rgb)  # Set image for prediction
@@ -61,7 +73,7 @@ class HandProcessor:
 
             input_points = np.array(points)
 
-            # Predict mask with point prompts
+            # Predict mask using point prompts
             masks, _, _ = self.sam2_predictor.predict(
                 point_coords=input_points,  # Pass the points as prompts
                 point_labels=np.ones(
@@ -79,7 +91,6 @@ class HandProcessor:
     Estimate depth map using MiDaS model
     """
 
-    # TODO: Swap MiDaS for ml-depth-pro model
     def _estimate_depth(self, image: np.ndarray) -> tuple:
         img = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
@@ -97,7 +108,7 @@ class HandProcessor:
             align_corners=False,
         ).squeeze()
 
-        # Convert to numpy and normalize to 0-255 for visualization
+        # Convert to numpy and normalize for visualization
         depth_map = prediction.cpu().numpy()
         depth_min = depth_map.min()
         depth_max = depth_map.max()
@@ -105,11 +116,17 @@ class HandProcessor:
 
         return depth_map, depth_map_normalized.astype(np.uint8)
 
-    # TODO: Look into better depth scaling
-    def _create_cloud(self, depth_map: np.ndarray, hand_mask: np.ndarray) -> np.ndarray:
+    """
+    Create a point cloud from combining depth map and segmented mask
+    by back-projecting to 3D using camera intrinsics and depth values
+    """
+
+    def _create_cloud(
+        self, depth_map: np.ndarray, segmented_mask: np.ndarray
+    ) -> np.ndarray:
         focal_x, focal_y, center_x, center_y = self.camera_intrinsics
 
-        v_coords, u_coords = np.where(hand_mask > 0)
+        v_coords, u_coords = np.where(segmented_mask > 0)
 
         z_values = depth_map[v_coords, u_coords]
 
         # Filter out zero depth values
@@ -118,7 +135,7 @@ class HandProcessor:
         v_coords = v_coords[valid_indices]
         z_values = z_values[valid_indices]
 
-        z_metric = z_values * 0.5
+        z_metric = z_values * HAND_WIDTH_MM * CLOUD_Z_SCALE / depth_map.max()
 
         # Back-project to 3D using camera intrinsics
         x_values = (u_coords - center_x) * z_metric / focal_x
@@ -128,15 +145,25 @@ class HandProcessor:
 
         return points
 
-    # TODO: Look into better depth scaling
+    """
+    Create hand mesh from hand landmarks
+    """
+
     def _create_mesh(self, landmarks: list, image_dims: tuple) -> np.ndarray:
         width, height = image_dims
 
+        # Extract just z values to understand their range
+        z_values = [landmark.z for landmark in landmarks.landmark]
+        z_min = min(z_values)
+        z_max = max(z_values)
+
         vertices = []
         for landmark in landmarks.landmark:
-            vertices.append(
-                [landmark.x * width, landmark.y * height, landmark.z * width]
-            )
+            # Scale z to same range as HAND_WIDTH_MM
+            normalized_z = (landmark.z - z_min) / (z_max - z_min + 1e-6)
+            scaled_z = normalized_z * HAND_WIDTH_MM
+
+            vertices.append([landmark.x * width, landmark.y * height, scaled_z])
 
         # Define faces (triangles) connecting landmarks
         faces = [
@@ -178,7 +205,10 @@ class HandProcessor:
 
         return np.array(dense_vertices), np.array(faces)
 
-    # TODO: Improve ICP registration for better alignment
+    """
+    Align hand mesh to point cloud using ICP for accurate 3D reconstruction
+    """
+
     def _icp_registration(self, cloud: np.ndarray, vertices: np.ndarray) -> np.ndarray:
         source = o3d.geometry.PointCloud()
         source.points = o3d.utility.Vector3dVector(vertices)
@@ -200,27 +230,39 @@ class HandProcessor:
         result = o3d.pipelines.registration.registration_icp(
             source,
             target,
-            max_correspondence_distance=0.1,  # Increased distance
+            max_correspondence_distance=0.05,
             init=trans_init,  # Initial transformation
             estimation_method=o3d.pipelines.registration.TransformationEstimationPointToPoint(),
             criteria=o3d.pipelines.registration.ICPConvergenceCriteria(
                 max_iteration=100
-            ),  # Increased iterations
+            ),
         )
 
         transformation = result.transformation
 
         return transformation
 
-    # TODO: Look into better depth scaling
-    def _refine_landmarks(self, landmarks: list, transform: int, image_dims: tuple):
+    """
+    Refine landmarks based on the ICP transformation
+    """
+
+    def _refine_landmarks(
+        self, landmarks: list, transform: np.ndarray, image_dims: tuple
+    ) -> list:
         width, height = image_dims
 
+        # Extract z range for normalization, similar to _create_mesh
+        z_values = [landmark.z for landmark in landmarks.landmark]
+        z_min = min(z_values)
+        z_max = max(z_values)
+
         refined_landmarks = []
         for landmark in landmarks.landmark:
-            point = np.array(
-                [landmark.x * width, landmark.y * height, landmark.z * width, 1.0]
-            )
+            # Use consistent scaling with _create_mesh
+            normalized_z = (landmark.z - z_min) / (z_max - z_min + 1e-6)
+            scaled_z = normalized_z * HAND_WIDTH_MM
+
+            point = np.array([landmark.x * width, landmark.y * height, scaled_z, 1.0])
 
             # Apply transformation to 3D point
             transformed = np.dot(transform, point)
@@ -228,6 +270,7 @@ class HandProcessor:
 
         return refined_landmarks
 
+    # TODO: Implement better constraints that limit last joint of each finger to a single DOF
     def _apply_constraints(self, landmarks: list):
         constrained = np.array(landmarks)
 
@@ -236,8 +279,7 @@ class HandProcessor:
         thumb_indices = [1, 2, 3, 4]
         index_indices = [5, 6, 7, 8]
 
-        # Constrain the last two joints of thumb and index finger
-        # as mentioned in the paper
+        # Constrain the last two joints of thumb and index finger as per the paper
         for finger_indices in [thumb_indices, index_indices]:
             # Get the last three joints (two segments)
             if len(finger_indices) >= 3:
@@ -250,8 +292,7 @@ class HandProcessor:
                 dir1 = joint2 - joint1
                 dir1 = dir1 / np.linalg.norm(dir1)
 
-                # Instead of full ball joint, constrain the last joint's direction
-                # to follow similar direction as the previous segment
+                # Instead of full ball joint, constrain last joint's direction to follow previous segment
                 ideal_dir = dir1.copy()
                 actual_dir = joint3 - joint2
                 actual_length = np.linalg.norm(actual_dir)
@@ -265,8 +306,14 @@ class HandProcessor:
 
         return constrained
 
-    def _get_robot_params(self, refined_landmarks):
-        # Extract keypoints
+    """
+    Extract robot parameters from refined landmarks:
+    1. Target Position: Midpoint between thumb tip and index tip
+    2. Target Orientation: Normal to the best-fitting plane of thumb and index finger
+    3. Gripper Width: Distance between thumb tip and index tip
+    """
+
+    def _get_robot_params(self, refined_landmarks: list) -> tuple:
         landmarks = np.array(refined_landmarks)
 
         # Define indices for specific parts of the hand
@@ -293,54 +340,52 @@ class HandProcessor:
         # Use SVD to find the normal to the best-fitting plane
         u, s, vh = np.linalg.svd(centered_points)
         # The normal is the last right singular vector
-        normal = vh[2, :]
-
-        # Ensure normal is a unit vector
-        normal = normal / np.linalg.norm(normal)
+        plane_normal = vh[2, :]
+        plane_normal = plane_normal / np.linalg.norm(plane_normal)
 
         # Fit a principal axis through thumb points
-        thumb_centroid = np.mean(thumb_points, axis=0)
-        thumb_centered = thumb_points - thumb_centroid
+        # Using direction from thumb base to tip for more robustness
+        thumb_direction = landmarks[thumb_tip_idx] - landmarks[thumb_indices[0]]
+        thumb_axis = thumb_direction / np.linalg.norm(thumb_direction)
 
-        # Use SVD again to find direction of maximum variance (principal axis)
-        u, s, vh = np.linalg.svd(thumb_centered)
-        principal_axis = vh[0, :]  # First singular vector
+        # Ensure thumb_axis is orthogonal to plane_normal
+        thumb_axis = thumb_axis - np.dot(thumb_axis, plane_normal) * plane_normal
+        thumb_axis = thumb_axis / np.linalg.norm(thumb_axis)
 
-        # Ensure principal axis is a unit vector
-        principal_axis = principal_axis / np.linalg.norm(principal_axis)
+        # Compute third axis as cross product to create orthogonal frame
+        cross_axis = np.cross(plane_normal, thumb_axis)
+        cross_axis = cross_axis / np.linalg.norm(cross_axis)
 
-        # Construct orthogonal vectors for orientation matrix
-        z_axis = normal  # Normal to the plane
-        x_axis = principal_axis  # Along the thumb
-
-        # Ensure x_axis is orthogonal to z_axis
-        x_axis = x_axis - np.dot(x_axis, z_axis) * z_axis
-        x_axis = x_axis / np.linalg.norm(x_axis)
-
-        # Compute y_axis as cross product to ensure orthogonality
-        y_axis = np.cross(z_axis, x_axis)
-
-        # Create rotation matrix
-        rotation_matrix = np.column_stack([x_axis, y_axis, z_axis])
+        # Create rotation matrix that aligns with the paper's description
+        # z-axis as normal, x-axis along thumb direction
+        rotation_matrix = np.column_stack([thumb_axis, cross_axis, plane_normal])
 
         # 3. Calculate gripper width
         gripper_width = np.linalg.norm(thumb_tip - index_tip)
-
-        # Apply percentile threshold to mitigate slippage (as mentioned in the paper)
-        # Store gripper widths in a buffer to compute percentiles
-        if not hasattr(self, "gripper_width_buffer"):
-            self.gripper_width_buffer = []
-        self.gripper_width_buffer.append(gripper_width)
-        # Keep only the last 100 values
-        if len(self.gripper_width_buffer) > 100:
-            self.gripper_width_buffer.pop(0)
 
-        # Apply 20th percentile threshold
+        # Apply 20th percentile threshold as specified in the paper
         if len(self.gripper_width_buffer) > 5:  # Need enough samples
             min_width = np.percentile(self.gripper_width_buffer, 20)
             if gripper_width < min_width:
-                gripper_width = min_width
+                gripper_width = 0.0  # Fully closed gripper when below threshold
+            else:
+                # Scale gripper_width to robot's specific range
+                gripper_width = min(1.0, gripper_width / MAXIMUM_HAND_WIDTH_MM)
+
+        # Convert from camera frame to robot frame
+        # Note: This requires the extrinsic matrix from camera to robot
+        # If extrinsics are available, uncomment and use this code:
+        # if hasattr(self, "camera_to_robot_transform"):
+        #     # Convert position to homogeneous coordinates
+        #     pos_homogeneous = np.append(position, 1.0)
+        #     # Apply transformation
+        #     robot_pos_homogeneous = np.dot(self.camera_to_robot_transform, pos_homogeneous)
+        #     position = robot_pos_homogeneous[:3]
+        #
+        #     # Convert rotation (special orthogonal transformation)
+        #     rotation_in_robot_frame = np.dot(self.camera_to_robot_transform[:3, :3], rotation_matrix)
+        #     rotation_matrix = rotation_in_robot_frame
 
         return position, rotation_matrix, gripper_width
@@ -483,16 +528,16 @@ class HandProcessor:
             # Process if hand is detected
             if results.multi_hand_landmarks:
                 for hand_landmarks in results.multi_hand_landmarks:
-                    hand_mask = self._create_mask(frame, hand_landmarks)
+                    segmented_mask = self._create_mask(frame, hand_landmarks)
 
                     mask_overlay = frame.copy()
-                    mask_overlay[hand_mask > 0] = [0, 0, 255]  # Red color for mask
+                    mask_overlay[segmented_mask > 0] = [0, 0, 255]  # Red color for mask
                     segmented_frame = cv2.addWeighted(frame, 0.7, mask_overlay, 0.3, 0)
 
                     depth_map, depth_vis = self._estimate_depth(frame)
                     depth_colored = cv2.applyColorMap(depth_vis, cv2.COLORMAP_JET)
                     depth_frame = depth_colored.copy()
 
-                    cloud = self._create_cloud(depth_map, hand_mask)
+                    cloud = self._create_cloud(depth_map, segmented_mask)
                     hand_vertices, hand_faces = self._create_mesh(
                         hand_landmarks, (width, height)