From 867f067b247f59ec95aec5c41ec67f0b14dbac3a Mon Sep 17 00:00:00 2001
From: Ethan Clark
Date: Wed, 26 Mar 2025 14:35:56 -0700
Subject: [PATCH] build out initial calibration for extrinsic parameters

---
 open_phantom/utils/calibration.py           | 214 -----
 open_phantom/utils/calibration/extrinsic.py | 825 ++++++++++++++++++++
 2 files changed, 825 insertions(+), 214 deletions(-)
 delete mode 100644 open_phantom/utils/calibration.py
 create mode 100644 open_phantom/utils/calibration/extrinsic.py

diff --git a/open_phantom/utils/calibration.py b/open_phantom/utils/calibration.py
deleted file mode 100644
index 9e85cc5..0000000
--- a/open_phantom/utils/calibration.py
+++ /dev/null
@@ -1,214 +0,0 @@
-import cv2
-import numpy as np
-import mediapipe as mp
-
-from collections import deque
-
-
-# Convert 3D robot positions to 2D screen positions
-def robot_to_screen_pos(robot_pos: list, width: int, height: int) -> tuple:
-    center_x, center_y = width // 2, height // 2
-    scale_factor = min(width, height) / 2
-
-    # Y-axis maps to horizontal
-    screen_x = int(center_x + robot_pos[1] * scale_factor)
-    # Z-axis maps to vertical (inverted)
-    screen_y = int(center_y - robot_pos[2] * scale_factor)
-
-    return screen_x, screen_y
-
-
-# Return 3D hand position from MediaPipe landmarks
-def get_hand_pos(landmarks, width: int, height: int):
-    # Use thumb tip and index finger tip
-    thumb_tip = landmarks.landmark[4]  # Thumb tip index
-    index_tip = landmarks.landmark[8]  # Index finger tip index
-
-    # Convert to 3D position
-    thumb_pos = np.array(
-        [thumb_tip.x * width, thumb_tip.y * height, thumb_tip.z * 100]
-    )  # Rough scaling
-    index_pos = np.array([index_tip.x * width, index_tip.y * height, index_tip.z * 100])
-
-    # Midpoint between thumb and index finger
-    position = (thumb_pos + index_pos) / 2
-
-    return position
-
-
-# Check if hand position is stable at current location
-def is_hand_stable(
-    position, history: deque, frames: int = 50, threshold: float = 8.0
-) -> bool:
-    history.append(position)
-
-    if len(history) < frames:
-        return False
-
-    positions = np.array(history)
-    max_movement = np.max(np.linalg.norm(positions - positions[-1], axis=1))
-
-    return max_movement < threshold
-
-
-def calibrate_camera() -> np.ndarray:
-    mp_hands = mp.solutions.hands
-    mp_drawing = mp.solutions.drawing_utils
-    hands = mp_hands.Hands(
-        model_complexity=1,
-        max_num_hands=1,
-        static_image_mode=False,
-    )
-
-    # Robot frame positions
-    reference_positions = [
-        [0.3, 0.0, 0.4],  # Center
-        [0.3, 0.2, 0.4],  # Right
-        [0.3, -0.2, 0.4],  # Left
-        [0.3, 0.0, 0.6],  # Higher
-    ]
-
-    cap = cv2.VideoCapture(0)
-
-    # Get camera dimensions
-    success, frame = cap.read()
-    assert success, "Failed to access camera"
-
-    width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
-    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
-
-    position_history = deque(maxlen=15)
-
-    current_target = 0
-    camera_positions = []
-    calibration_complete = False
-
-    print("Automatic calibration starting.")
-    print("Please move your hand to the highlighted positions on screen.")
-
-    while not calibration_complete:
-        success, frame = cap.read()
-        if not success:
-            continue
-
-        frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
-        results = hands.process(frame_rgb)
-
-        target_position = reference_positions[current_target]
-        screen_target = robot_to_screen_pos(target_position, width, height)
-
-        # Draw target
-        cv2.circle(frame, screen_target, 50, (0, 255, 0), 2)
-        cv2.circle(frame, screen_target, 10, (0, 255, 0), -1)
-        cv2.putText(
-            frame,
-            f"Target {current_target+1}: Move hand here",
-            (screen_target[0] - 100, screen_target[1] - 20),
-            cv2.FONT_HERSHEY_SIMPLEX,
-            0.7,
-            (0, 255, 0),
-            2,
-        )
here", - (screen_target[0] - 100, screen_target[1] - 20), - cv2.FONT_HERSHEY_SIMPLEX, - 0.7, - (0, 255, 0), - 2, - ) - - hand_position = None - - if results.multi_hand_landmarks: - hand_landmarks = results.multi_hand_landmarks[0] # Just use the first hand - - mp_drawing.draw_landmarks(frame, hand_landmarks, mp_hands.HAND_CONNECTIONS) - - hand_position = get_hand_pos(hand_landmarks) - - # Draw hand position indicator - hand_x, hand_y = int(hand_position[0]), int(hand_position[1]) - cv2.circle(frame, (hand_x, hand_y), 15, (255, 0, 0), -1) - - # Check if hand is close to target (in 2D screen space for simplicity) - distance = np.sqrt( - (hand_x - screen_target[0]) ** 2 + (hand_y - screen_target[1]) ** 2 - ) - - if distance < 50: # Threshold in pixels - cv2.putText( - frame, - "Hold position steady...", - (50, 50), - cv2.FONT_HERSHEY_SIMPLEX, - 1.0, - (0, 0, 255), - 2, - ) - - if is_hand_stable(hand_position, position_history): - camera_positions.append(hand_position) - print(f"Position {current_target+1} recorded!") - - # Move to next target and reset history - current_target += 1 - position_history.clear() - - if current_target >= len(reference_positions): - calibration_complete = True - else: - print(f"Please move to position {current_target+1}") - - cv2.putText( - frame, - f"Calibrating: {current_target}/{len(reference_positions)} positions", - (10, height - 20), - cv2.FONT_HERSHEY_SIMPLEX, - 0.7, - (255, 255, 255), - 2, - ) - - cv2.imshow("Calibration", frame) - - if cv2.waitKey(1) & 0xFF == ord("q"): - break - - cap.release() - cv2.destroyAllWindows() - - if not calibration_complete: - print("Calibration aborted.") - return None - - robot_points = np.array(reference_positions) - robot_centroid = np.mean(robot_points, axis=0) - robot_centered = robot_points - robot_centroid - - camera_points = np.array(camera_positions) - camera_centroid = np.mean(camera_points, axis=0) - camera_centered = camera_points - camera_centroid - - # Find rotation using SVD - H = np.dot(camera_centered.T, robot_centered) - U, S, Vt = np.linalg.svd(H) - rotation = np.dot(U, Vt) - - # Check that it's a proper rotation matrix - if np.linalg.det(rotation) < 0: - Vt[-1, :] *= -1 - rotation = np.dot(U, Vt) - - translation = robot_centroid - np.dot(rotation, camera_centroid) - - # Create homogeneous transform matrix - transform = np.eye(4) - transform[:3, :3] = rotation - transform[:3, 3] = translation - - print("Calibration complete!") - - np.save("camera_extrinsics.npy", transform) - - return transform - - -if __name__ == "__main__": - transform = calibrate_camera() - if transform is not None: - print("Transform matrix:") - print(transform) - print("Saved to camera_to_robot_transform.npy") diff --git a/open_phantom/utils/calibration/extrinsic.py b/open_phantom/utils/calibration/extrinsic.py new file mode 100644 index 0000000..a41a02d --- /dev/null +++ b/open_phantom/utils/calibration/extrinsic.py @@ -0,0 +1,825 @@ +import os +import re +import cv2 +import torch +import numpy as np +import mediapipe as mp + +from collections import deque +from scipy.signal import savgol_filter +from scipy.optimize import least_squares +from intrinsic import calibrate_intrinsics + + +REFERENCE_OBJECT_SIZE = 0.50 # 50cm +DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu") + + +def robot_to_screen_pos(robot_pos: list, K: np.ndarray) -> tuple: + """ + Project 3D robot position to 2D screen position using intrinsic matrix + """ + # Create homogeneous point + robot_pt_homogeneous = np.append(robot_pos, 1.0) + + # Project 
+
+
+# Filter hand position using Savitzky-Golay to smooth out noise
+def filter_hand_position(position: list, history: deque, window_size: int = 7) -> list:
+    history.append(position)
+    if len(history) < 5:
+        return position
+
+    history_array = np.array(history)
+
+    if len(history) >= window_size:
+        filtered_x = savgol_filter(history_array[:, 0], window_size, 2)
+        filtered_y = savgol_filter(history_array[:, 1], window_size, 2)
+        filtered_z = savgol_filter(history_array[:, 2], window_size, 2)
+        filtered_position = np.array([filtered_x[-1], filtered_y[-1], filtered_z[-1]])
+    else:
+        # Simple average if not enough history
+        filtered_position = np.mean(history_array, axis=0)
+
+    return filtered_position
+
+
+# Return 3D hand position from MediaPipe landmarks using intrinsic matrix
+def get_hand_pos(
+    landmarks: list, intrinsic_matrix: np.ndarray, width: int, height: int
+) -> np.ndarray:
+    """
+    Get a 3D hand position by back-projecting pixels through the camera
+    intrinsics
+    """
+    # Use thumb tip and index finger tip
+    thumb_tip = landmarks.landmark[4]  # Thumb tip index
+    index_tip = landmarks.landmark[8]  # Index finger tip index
+
+    # Convert landmark coordinates to pixel coordinates
+    thumb_img = np.array([thumb_tip.x * width, thumb_tip.y * height, 1.0])
+    index_img = np.array([index_tip.x * width, index_tip.y * height, 1.0])
+
+    # Apply inverse of intrinsic matrix to get normalized camera coordinates
+    K_inv = np.linalg.inv(intrinsic_matrix)
+    thumb_norm = np.dot(K_inv, thumb_img)
+    index_norm = np.dot(K_inv, index_img)
+
+    # Scale by depth (using MediaPipe's relative depth estimate)
+    # Note: this scaling is approximate; ideally we would use a proper depth sensor
+    thumb_depth = max(0.1, thumb_tip.z * 100)  # Avoid negative/zero depth
+    index_depth = max(0.1, index_tip.z * 100)  # Avoid negative/zero depth
+
+    thumb_pos = thumb_norm * thumb_depth
+    index_pos = index_norm * index_depth
+
+    # Midpoint between thumb and index finger
+    position = (thumb_pos + index_pos) / 2
+
+    return position[:3]  # Return x, y, z
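+
+
+# --- Editor's sketch: back-projection round trip (hypothetical values) ---
+# get_hand_pos follows the pattern pixel -> K^-1 -> normalized ray -> scale
+# by depth. Projecting the result back through K recovers the original
+# pixel, a quick consistency check between the two directions.
+def _check_backprojection_example() -> None:
+    K_example = np.array(
+        [[500.0, 0.0, 320.0], [0.0, 500.0, 240.0], [0.0, 0.0, 1.0]]
+    )
+    pixel = np.array([400.0, 300.0, 1.0])  # homogeneous pixel coordinates
+    ray = np.linalg.inv(K_example) @ pixel  # normalized camera ray
+    point = ray * 1.5  # assume a depth of 1.5 m
+    reprojected = K_example @ point
+    reprojected = reprojected[:2] / reprojected[2]
+    assert np.allclose(reprojected, pixel[:2])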
+
+
+def measure_pinch_size(
+    landmarks: list,
+    intrinsic_matrix: np.ndarray,
+    frame: np.ndarray,
+    midas_model,
+    midas_transform,
+    width: int,
+    height: int,
+) -> float:
+    """
+    Measure distance between thumb and index finger in 3D space
+    using intrinsics and MiDaS depth estimation
+    """
+    # Use thumb tip and index finger tip
+    thumb_tip = landmarks.landmark[4]  # Thumb tip
+    index_tip = landmarks.landmark[8]  # Index finger tip
+
+    # Convert to pixel coordinates
+    thumb_pixel = (int(thumb_tip.x * width), int(thumb_tip.y * height))
+    index_pixel = (int(index_tip.x * width), int(index_tip.y * height))
+
+    # Get depth map using MiDaS
+    img = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
+    img_input = midas_transform(img).to(DEVICE)
+
+    with torch.no_grad():
+        depth_map = midas_model(img_input)
+        depth_map = torch.nn.functional.interpolate(
+            depth_map.unsqueeze(1),
+            size=(height, width),
+            mode="bicubic",
+            align_corners=False,
+        ).squeeze()
+        depth_map = depth_map.cpu().numpy()
+
+    # Sample depths at thumb and index finger locations
+    thumb_depth = depth_map[thumb_pixel[1], thumb_pixel[0]]
+    index_depth = depth_map[index_pixel[1], index_pixel[0]]
+
+    # Normalize depths (MiDaS gives relative depths)
+    # We need to convert to metric scale
+    # TODO: This scaling factor needs calibration
+    depth_scale = 5.0
+    thumb_depth_metric = thumb_depth * depth_scale
+    index_depth_metric = index_depth * depth_scale
+
+    # Apply inverse of intrinsic matrix to get normalized camera coordinates
+    K_inv = np.linalg.inv(intrinsic_matrix)
+
+    # Convert to homogeneous coordinates
+    thumb_img = np.array([thumb_pixel[0], thumb_pixel[1], 1.0])
+    index_img = np.array([index_pixel[0], index_pixel[1], 1.0])
+
+    # Get normalized camera coordinates
+    thumb_norm = np.dot(K_inv, thumb_img)
+    index_norm = np.dot(K_inv, index_img)
+
+    # Scale by depth to get 3D coordinates
+    thumb_pos = thumb_norm * thumb_depth_metric
+    index_pos = index_norm * index_depth_metric
+
+    # Calculate Euclidean distance
+    distance = np.linalg.norm(thumb_pos - index_pos)
+
+    # Visualize depths for debugging
+    cv2.putText(
+        frame,
+        f"Thumb depth: {thumb_depth:.2f}, Index depth: {index_depth:.2f}",
+        (10, height - 50),
+        cv2.FONT_HERSHEY_SIMPLEX,
+        0.6,
+        (255, 0, 0),
+        2,
+    )
+
+    return distance
+
+
+def is_hand_stable(
+    position, history: deque, n_frames: int = 50, threshold: float = 8.0
+) -> bool:
+    """Check if hand position is stable using a simple threshold"""
+    history.append(position)
+
+    if len(history) < n_frames:
+        return False
+
+    positions = np.array(history)
+    max_movement = np.max(np.linalg.norm(positions - positions[-1], axis=1))
+
+    return max_movement < threshold
+
+
+# Compute extrinsics using PnP (Perspective-n-Point)
+def compute_extrinsics_pnp(
+    camera_points: np.ndarray, robot_points: np.ndarray, intrinsic_matrix: np.ndarray
+) -> np.ndarray:
+    """
+    Compute camera extrinsics using the PnP algorithm.
+
+    Args:
+        camera_points: 3D points in camera frame
+        robot_points: 3D points in robot frame
+        intrinsic_matrix: Camera intrinsic matrix
+
+    Returns:
+        4x4 transform mapping robot-frame points into the camera frame
+        (OpenCV's object-to-camera convention)
+    """
+    # Convert 3D camera points to 2D image points using intrinsic matrix
+    image_points = []
+    for pt in camera_points:
+        # Project 3D point to 2D using the camera model
+        pt_homogeneous = np.append(pt, 1.0)
+        img_pt = np.dot(intrinsic_matrix, pt_homogeneous[:3])
+        # Normalize to get image coordinates
+        img_pt = img_pt[:2] / img_pt[2]
+        image_points.append(img_pt)
+
+    image_points = np.array(image_points, dtype=np.float32)
+    robot_points = np.array(robot_points, dtype=np.float32)
+
+    # Use OpenCV's solvePnP to get the camera pose relative to the robot
+    success, rvec, tvec = cv2.solvePnP(
+        robot_points,
+        image_points,
+        intrinsic_matrix,
+        distCoeffs=None,  # Add distortion coefficients if available
+        flags=cv2.SOLVEPNP_ITERATIVE,
+    )
+
+    # Convert rotation vector to rotation matrix
+    rot_matrix, _ = cv2.Rodrigues(rvec)
+
+    # Create transformation matrix
+    transform = np.eye(4)
+    transform[:3, :3] = rot_matrix
+    transform[:3, 3] = tvec.flatten()
+
+    return transform
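+
+
+# --- Editor's sketch: synthetic PnP sanity check (hypothetical values) ---
+# Generate camera points from a known pose and confirm the function above
+# recovers that pose; the tolerances are illustrative. This validates the
+# math without a camera or robot attached.
+def _check_pnp_example() -> None:
+    K_example = np.array(
+        [[500.0, 0.0, 320.0], [0.0, 500.0, 240.0], [0.0, 0.0, 1.0]]
+    )
+    rvec_true = np.array([0.1, -0.2, 0.05])
+    R_true, _ = cv2.Rodrigues(rvec_true)
+    t_true = np.array([0.1, -0.05, 1.2])
+    robot_pts = np.array(
+        [
+            [0.3, 0.0, 0.4],
+            [0.3, 0.2, 0.4],
+            [0.3, -0.2, 0.4],
+            [0.3, 0.0, 0.6],
+            [0.2, 0.2, 0.3],
+            [0.2, -0.2, 0.5],
+        ]
+    )
+    # Transform robot-frame points into the (noise-free) camera frame
+    camera_pts = (R_true @ robot_pts.T).T + t_true
+    transform = compute_extrinsics_pnp(camera_pts, robot_pts, K_example)
+    assert np.allclose(transform[:3, :3], R_true, atol=1e-3)
+    assert np.allclose(transform[:3, 3], t_true, atol=1e-3)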
+
+
+def refine_calibration(
+    camera_points: np.ndarray,
+    robot_points: np.ndarray,
+    initial_transform: np.ndarray,
+    intrinsic_matrix: np.ndarray,
+) -> np.ndarray:
+    """
+    Refine calibration using bundle adjustment optimization
+
+    Args:
+        camera_points: 3D points in camera frame
+        robot_points: 3D points in robot frame
+        initial_transform: Initial robot-to-camera transformation matrix
+        intrinsic_matrix: Camera intrinsic matrix
+
+    Returns:
+        Refined 4x4 transformation matrix
+    """
+    # Extract rotation and translation from initial transform
+    R = initial_transform[:3, :3]
+    t = initial_transform[:3, 3]
+
+    # Convert rotation matrix to Rodrigues vector
+    rvec, _ = cv2.Rodrigues(R)
+
+    # Initial parameters (rotation vector and translation)
+    params = np.concatenate([rvec.flatten(), t])
+
+    # Define the error function
+    def error_function(params):
+        rvec = params[:3].reshape(3, 1)
+        tvec = params[3:].reshape(3, 1)
+
+        errors = []
+        for robot_pt, camera_pt in zip(robot_points, camera_points):
+            robot_pt = robot_pt.reshape(1, 3)
+
+            # Project robot point to the image using the current parameters
+            img_pt, _ = cv2.projectPoints(robot_pt, rvec, tvec, intrinsic_matrix, None)
+            img_pt = img_pt.flatten()
+
+            # Convert camera point to image coordinates using intrinsics
+            camera_pt_h = np.append(camera_pt, 1.0)
+            proj_camera_pt = np.dot(intrinsic_matrix, camera_pt_h[:3])
+            proj_camera_pt = proj_camera_pt[:2] / proj_camera_pt[2]
+
+            # Calculate reprojection error
+            error = img_pt - proj_camera_pt
+            errors.extend(error)
+
+        return np.array(errors)
+
+    # Run optimization
+    result = least_squares(error_function, params, method="lm")
+
+    # Convert optimized parameters back to a transformation matrix
+    rvec_opt = result.x[:3].reshape(3, 1)
+    tvec_opt = result.x[3:].reshape(3, 1)
+
+    R_opt, _ = cv2.Rodrigues(rvec_opt)
+
+    transform_opt = np.eye(4)
+    transform_opt[:3, :3] = R_opt
+    transform_opt[:3, 3] = tvec_opt.flatten()
+
+    return transform_opt
+
+
+def calc_reprojection_error(
+    robot_points: list,
+    camera_points: list,
+    transform: np.ndarray,
+    intrinsic_matrix: np.ndarray,
+) -> tuple:
+    """
+    Calculate reprojection error using the intrinsic matrix for proper projection
+    """
+    # Extract rotation and translation
+    R = transform[:3, :3]
+    t = transform[:3, 3]
+
+    # Convert to rotation vector
+    rvec, _ = cv2.Rodrigues(R)
+    tvec = t.reshape(3, 1)
+
+    # Project robot points to image space
+    robot_points = np.array(robot_points, dtype=np.float32)
+    projected_pts, _ = cv2.projectPoints(
+        robot_points, rvec, tvec, intrinsic_matrix, None
+    )
+    projected_pts = projected_pts.reshape(-1, 2)
+
+    # Convert camera points to image space
+    camera_img_pts = []
+    for pt in camera_points:
+        pt_h = np.append(pt, 1.0)
+        img_pt = np.dot(intrinsic_matrix, pt_h[:3])
+        img_pt = img_pt[:2] / img_pt[2]
+        camera_img_pts.append(img_pt)
+
+    camera_img_pts = np.array(camera_img_pts)
+
+    # Calculate errors in image space (pixel distances)
+    errors = np.linalg.norm(projected_pts - camera_img_pts, axis=1)
+
+    return errors, np.mean(errors)
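+
+
+# --- Editor's sketch: zero-error check for calc_reprojection_error ---
+# (hypothetical values). If the camera points are generated exactly by the
+# transform, the mean reprojection error should be numerically zero.
+def _check_reprojection_example() -> None:
+    K_example = np.array(
+        [[500.0, 0.0, 320.0], [0.0, 500.0, 240.0], [0.0, 0.0, 1.0]]
+    )
+    transform = np.eye(4)
+    transform[:3, 3] = [0.0, 0.0, 1.0]  # pure 1 m translation along z
+    robot_pts = [[0.1, 0.0, 0.2], [0.0, 0.1, 0.3], [-0.1, 0.0, 0.4]]
+    camera_pts = [np.array(p) + np.array([0.0, 0.0, 1.0]) for p in robot_pts]
+    errors, mean_error = calc_reprojection_error(
+        robot_pts, camera_pts, transform, K_example
+    )
+    assert errors.shape == (3,)
+    assert mean_error < 1e-4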
+ ) + return None + + mp_hands = mp.solutions.hands + mp_drawing = mp.solutions.drawing_utils + hands = mp_hands.Hands( + model_complexity=1, + max_num_hands=1, + static_image_mode=False, + ) + + midas = torch.hub.load("intel-isl/MiDaS", "MiDaS_small") + midas.to(DEVICE) + midas.eval() + transform = torch.hub.load("intel-isl/MiDaS", "transforms") + transform = transform.small_transform + + # All in robot coordinates + reference_positions = [ + [0.3, 0.0, 0.4], # Center + [0.3, 0.2, 0.4], # Right + [0.3, -0.2, 0.4], # Left + [0.3, 0.0, 0.6], # Higher + [0.3, 0.0, 0.2], # Lower + [0.2, 0.2, 0.3], # Front right bottom + [0.2, -0.2, 0.3], # Front left bottom + [0.2, 0.2, 0.5], # Front right top + [0.2, -0.2, 0.5], # Front left top + ] + + # Validation positions for reprojection error + validation_positions = [[0.25, 0.1, 0.35], [0.35, -0.1, 0.45], [0.3, 0.15, 0.5]] + + cap = cv2.VideoCapture(0) + + success, frame = cap.read() + assert success, "Failed to access camera" + + width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)) + height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)) + + n_frames = 50 + position_history = deque(maxlen=2 * n_frames) + + print("Step 1: Scale calibration") + print( + f"Hold your thumb and index finger exactly {REFERENCE_OBJECT_SIZE*100}cm apart" + ) + + scale_factor = None + scale_calibration_complete = False + + reference_distances = [] + + while not scale_calibration_complete: + success, frame = cap.read() + if not success: + continue + + frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB) + results = hands.process(frame_rgb) + + cv2.putText( + frame, + f"Hold thumb and index finger exactly {REFERENCE_OBJECT_SIZE*100}cm apart", + (10, 30), + cv2.FONT_HERSHEY_SIMPLEX, + 0.7, + (0, 255, 0), + 2, + ) + cv2.putText( + frame, + "Hold position steady...", + (10, 70), + cv2.FONT_HERSHEY_SIMPLEX, + 0.7, + (0, 255, 0), + 2, + ) + + if results.multi_hand_landmarks: + hand_landmarks = results.multi_hand_landmarks[0] + mp_drawing.draw_landmarks(frame, hand_landmarks, mp_hands.HAND_CONNECTIONS) + + distance = measure_pinch_size(hand_landmarks, intrinsics, width, height) + + cv2.putText( + frame, + f"Current distance: {distance:.2f}", + (10, 110), + cv2.FONT_HERSHEY_SIMPLEX, + 0.7, + (0, 255, 0), + 2, + ) + + hand_position = get_hand_pos(hand_landmarks, intrinsics, width, height) + if is_hand_stable(hand_position, position_history, n_frames=30): + reference_distances.append(distance) + cv2.putText( + frame, + "Measurement taken!", + (10, 150), + cv2.FONT_HERSHEY_SIMPLEX, + 0.7, + (0, 0, 255), + 2, + ) + + # Take 5 measurements for robustness + if len(reference_distances) >= 5: + # Calculate scale factor (median to avoid outliers) + median_distance = np.median(reference_distances) + scale_factor = REFERENCE_OBJECT_SIZE / median_distance + print(f"Scale factor: {scale_factor}") + scale_calibration_complete = True + position_history.clear() + else: + # Wait a bit between measurements + for i in range(30): + cv2.imshow("Calibration", frame) + cv2.waitKey(100) + position_history.clear() + + cv2.putText( + frame, + f"Measurements: {len(reference_distances)}/5", + (10, height - 20), + cv2.FONT_HERSHEY_SIMPLEX, + 0.7, + (255, 255, 255), + 2, + ) + + cv2.imshow("Calibration", frame) + + if cv2.waitKey(1) & 0xFF == ord("q"): + break + + print("Step 2: Position calibration") + print("Please move your hand to the highlighted positions on screen.") + + current_target = 0 + camera_positions = [] + filtering_history = deque(maxlen=n_frames) + + calibration_complete = False + + while not 
+
+    print("Step 2: Position calibration")
+    print("Please move your hand to the highlighted positions on screen.")
+
+    current_target = 0
+    camera_positions = []
+    filtering_history = deque(maxlen=n_frames)
+
+    calibration_complete = False
+
+    while not calibration_complete:
+        success, frame = cap.read()
+        if not success:
+            continue
+
+        frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
+        results = hands.process(frame_rgb)
+
+        target_position = reference_positions[current_target]
+        # Use intrinsic matrix for accurate projection
+        screen_target = robot_to_screen_pos(target_position, intrinsics)
+
+        # Draw target
+        cv2.circle(frame, screen_target, 50, (0, 255, 0), 2)
+        cv2.circle(frame, screen_target, 10, (0, 255, 0), -1)
+        cv2.putText(
+            frame,
+            f"Target {current_target+1}: Move hand here",
+            (screen_target[0] - 100, screen_target[1] - 20),
+            cv2.FONT_HERSHEY_SIMPLEX,
+            0.7,
+            (0, 255, 0),
+            2,
+        )
+
+        hand_position = None
+
+        if results.multi_hand_landmarks:
+            hand_landmarks = results.multi_hand_landmarks[0]
+            mp_drawing.draw_landmarks(frame, hand_landmarks, mp_hands.HAND_CONNECTIONS)
+
+            raw_hand_position = get_hand_pos(hand_landmarks, intrinsics, width, height)
+
+            hand_position = filter_hand_position(raw_hand_position, filtering_history)
+
+            hand_position_scaled = hand_position * scale_factor
+
+            # Project hand position to screen
+            hand_pt_h = np.append(hand_position, 1.0)
+            screen_pt = np.dot(intrinsics, hand_pt_h[:3])
+            screen_pt = (
+                int(screen_pt[0] / screen_pt[2]),
+                int(screen_pt[1] / screen_pt[2]),
+            )
+
+            # Draw hand position indicator
+            cv2.circle(frame, screen_pt, 15, (255, 0, 0), -1)
+
+            # Check if hand is close to target (in 2D screen space for simplicity)
+            distance = np.sqrt(
+                (screen_pt[0] - screen_target[0]) ** 2
+                + (screen_pt[1] - screen_target[1]) ** 2
+            )
+
+            if distance < 50:  # Threshold in pixels
+                cv2.putText(
+                    frame,
+                    "Hold position steady...",
+                    (50, 50),
+                    cv2.FONT_HERSHEY_SIMPLEX,
+                    1.0,
+                    (0, 0, 255),
+                    2,
+                )
+
+                if is_hand_stable(hand_position, position_history, n_frames):
+                    camera_positions.append(hand_position_scaled)
+                    print(f"Position {current_target+1} recorded!")
+
+                    # Move to next target and reset history
+                    current_target += 1
+                    position_history.clear()
+
+                    if current_target >= len(reference_positions):
+                        calibration_complete = True
+                    else:
+                        print(f"Please move to position {current_target+1}")
+
+        cv2.putText(
+            frame,
+            f"Calibrating: {current_target}/{len(reference_positions)} positions",
+            (10, height - 20),
+            cv2.FONT_HERSHEY_SIMPLEX,
+            0.7,
+            (255, 255, 255),
+            2,
+        )
+
+        cv2.imshow("Calibration", frame)
+
+        if cv2.waitKey(1) & 0xFF == ord("q"):
+            break
+
+    if not calibration_complete:
+        print("Calibration aborted.")
+        cap.release()
+        cv2.destroyAllWindows()
+        return None
+
+    # Use PnP for the initial extrinsic estimate
+    robot_points = np.array(reference_positions)
+    camera_points = np.array(camera_positions)
+
+    # Compute initial transform using PnP
+    initial_transform = compute_extrinsics_pnp(camera_points, robot_points, intrinsics)
+    print("Initial transform computed using PnP:")
+    print(initial_transform)
+
+    # Refine using bundle adjustment
+    refined_transform = refine_calibration(
+        camera_points, robot_points, initial_transform, intrinsics
+    )
+    print("Refined transform after bundle adjustment:")
+    print(refined_transform)
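+
+    # --- Editor's note ---
+    # Two-stage estimation: solvePnP provides an initial pose from the
+    # 2D-3D correspondences, and least_squares then minimizes the pixel
+    # reprojection residuals around that initial guess, effectively a
+    # small bundle adjustment with the intrinsics held fixed.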
Now validating...") + + validation_camera_positions = [] + current_target = 0 + position_history.clear() + + while current_target < len(validation_positions): + success, frame = cap.read() + if not success: + continue + + frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB) + results = hands.process(frame_rgb) + + target_position = validation_positions[current_target] + screen_target = robot_to_screen_pos(target_position, intrinsics) + + # Draw validation target + cv2.circle(frame, screen_target, 50, (255, 165, 0), 2) # Orange + cv2.circle(frame, screen_target, 10, (255, 165, 0), -1) + cv2.putText( + frame, + f"Validation {current_target+1}: Move hand here", + (screen_target[0] - 100, screen_target[1] - 20), + cv2.FONT_HERSHEY_SIMPLEX, + 0.7, + (255, 165, 0), + 2, + ) + + if results.multi_hand_landmarks: + hand_landmarks = results.multi_hand_landmarks[0] + mp_drawing.draw_landmarks(frame, hand_landmarks, mp_hands.HAND_CONNECTIONS) + + raw_hand_position = get_hand_pos(hand_landmarks, intrinsics, width, height) + hand_position = filter_hand_position(raw_hand_position, filtering_history) + hand_position_scaled = hand_position * scale_factor + + # Project hand position to screen for visualization + hand_pt_h = np.append(hand_position, 1.0) + screen_pt = np.dot(intrinsics, hand_pt_h[:3]) + screen_pt = ( + int(screen_pt[0] / screen_pt[2]), + int(screen_pt[1] / screen_pt[2]), + ) + + # Draw hand position indicator + cv2.circle(frame, screen_pt, 15, (255, 0, 0), -1) + + # Check if hand is close to target + distance = np.sqrt( + (screen_pt[0] - screen_target[0]) ** 2 + + (screen_pt[1] - screen_target[1]) ** 2 + ) + + if distance < 50: + cv2.putText( + frame, + "Hold position steady for validation...", + (50, 50), + cv2.FONT_HERSHEY_SIMPLEX, + 0.8, + (255, 165, 0), + 2, + ) + + if is_hand_stable(hand_position, position_history, n_frames=30): + validation_camera_positions.append(hand_position_scaled) + print(f"Validation position {current_target+1} recorded!") + current_target += 1 + position_history.clear() + + cv2.putText( + frame, + f"Validation: {current_target}/{len(validation_positions)} positions", + (10, height - 20), + cv2.FONT_HERSHEY_SIMPLEX, + 0.7, + (255, 255, 255), + 2, + ) + + cv2.imshow("Calibration", frame) + + if cv2.waitKey(1) & 0xFF == ord("q"): + break + + # Use the refined transform for validation + errors, mean_error = calc_reprojection_error( + validation_positions, validation_camera_positions, refined_transform, intrinsics + ) + + print("Validation complete!") + print(f"Mean reprojection error: {mean_error:.4f} pixels") + print(f"Max reprojection error: {np.max(errors):.4f} pixels") + print(f"Min reprojection error: {np.min(errors):.4f} pixels") + + calibration_data = { + "transform": refined_transform, + "scale_factor": scale_factor, + "reprojection_error": mean_error, + "intrinsic_matrix": intrinsics, + "calibration_date": np.datetime64("now"), + } + np.save("extrinsics.npy", calibration_data) + + # Show validation results visually + results_img = np.zeros((300, 600, 3), dtype=np.uint8) + cv2.putText( + results_img, + "Calibration Results", + (150, 40), + cv2.FONT_HERSHEY_SIMPLEX, + 1.0, + (255, 255, 255), + 2, + ) + cv2.putText( + results_img, + f"Scale Factor: {scale_factor:.6f}", + (30, 100), + cv2.FONT_HERSHEY_SIMPLEX, + 0.7, + (0, 255, 0), + 2, + ) + cv2.putText( + results_img, + f"Mean Reprojection Error: {mean_error:.4f} pixels", + (30, 140), + cv2.FONT_HERSHEY_SIMPLEX, + 0.7, + (0, 255, 0) + if mean_error < 10 + else (0, 165, 255) + if mean_error < 20 + else (0, 0, 
+
+    # Show validation results visually
+    results_img = np.zeros((300, 600, 3), dtype=np.uint8)
+    cv2.putText(
+        results_img,
+        "Calibration Results",
+        (150, 40),
+        cv2.FONT_HERSHEY_SIMPLEX,
+        1.0,
+        (255, 255, 255),
+        2,
+    )
+    cv2.putText(
+        results_img,
+        f"Scale Factor: {scale_factor:.6f}",
+        (30, 100),
+        cv2.FONT_HERSHEY_SIMPLEX,
+        0.7,
+        (0, 255, 0),
+        2,
+    )
+    cv2.putText(
+        results_img,
+        f"Mean Reprojection Error: {mean_error:.4f} pixels",
+        (30, 140),
+        cv2.FONT_HERSHEY_SIMPLEX,
+        0.7,
+        (0, 255, 0)
+        if mean_error < 10
+        else (0, 165, 255)
+        if mean_error < 20
+        else (0, 0, 255),
+        2,
+    )
+    cv2.putText(
+        results_img,
+        f"Max Reprojection Error: {np.max(errors):.4f} pixels",
+        (30, 180),
+        cv2.FONT_HERSHEY_SIMPLEX,
+        0.7,
+        (0, 255, 0)
+        if np.max(errors) < 15
+        else (0, 165, 255)
+        if np.max(errors) < 30
+        else (0, 0, 255),
+        2,
+    )
+
+    quality = (
+        "Excellent"
+        if mean_error < 5
+        else "Good"
+        if mean_error < 10
+        else "Acceptable"
+        if mean_error < 20
+        else "Poor"
+    )
+    cv2.putText(
+        results_img,
+        f"Calibration Quality: {quality}",
+        (30, 240),
+        cv2.FONT_HERSHEY_SIMPLEX,
+        0.9,
+        (0, 255, 0)
+        if quality in ["Excellent", "Good"]
+        else (0, 165, 255)
+        if quality == "Acceptable"
+        else (0, 0, 255),
+        2,
+    )
+
+    cv2.imshow("Calibration Results", results_img)
+    cv2.waitKey(0)
+
+    cap.release()
+    cv2.destroyAllWindows()
+
+    return refined_transform
+
+
+if __name__ == "__main__":
+    transform = calibrate_extrinsics()
+
+    if transform is not None:
+        print("Calibration successful!")
+        print(transform)
+    else:
+        print("Calibration failed.")