build out initial calibration for extrinsic parameters

Ethan Clark 2025-03-26 14:35:56 -07:00
parent 83d06a07be
commit 867f067b24
2 changed files with 825 additions and 214 deletions


@@ -1,214 +0,0 @@
import cv2
import numpy as np
import mediapipe as mp
from collections import deque
# Convert 3D robot positions to 2D screen positions
def robot_to_screen_pos(robot_pos: list, width: int, height: int) -> tuple:
center_x, center_y = width // 2, height // 2
scale_factor = min(width, height) / 2
# Y-axis maps to horizontal
screen_x = int(center_x + robot_pos[1] * scale_factor)
# Z-axis maps to vertical (inverted)
screen_y = int(center_y - robot_pos[2] * scale_factor)
return screen_x, screen_y
# Return 3D hand position from MediaPipe landmarks
def get_hand_pos(landmarks, width: int, height: int):
# Use thumb tip and index finger tip
thumb_tip = landmarks.landmark[4] # Thumb tip index
index_tip = landmarks.landmark[8] # Index finger tip index
# Convert to 3D position
thumb_pos = np.array(
[thumb_tip.x * width, thumb_tip.y * height, thumb_tip.z * 100]
) # Rough scaling
index_pos = np.array([index_tip.x * width, index_tip.y * height, index_tip.z * 100])
# Midpoint between thumb and index finger
position = (thumb_pos + index_pos) / 2
return position
# Check if hand position is stable at current location
def is_hand_stable(
position, history: deque, frames: int = 50, threshold: float = 8.0
) -> bool:
history.append(position)
if len(history) < frames:
return False
positions = np.array(history)
max_movement = np.max(np.linalg.norm(positions - positions[-1], axis=1))
return max_movement < threshold
def calibrate_camera() -> np.ndarray:
mp_hands = mp.solutions.hands
mp_drawing = mp.solutions.drawing_utils
hands = mp_hands.Hands(
model_complexity=1,
max_num_hands=1,
static_image_mode=False,
)
# Robot frame positions
reference_positions = [
[0.3, 0.0, 0.4], # Center
[0.3, 0.2, 0.4], # Right
[0.3, -0.2, 0.4], # Left
[0.3, 0.0, 0.6], # Higher
]
cap = cv2.VideoCapture(0)
# Get camera dimensions
success, frame = cap.read()
assert success, "Failed to access camera"
width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    # History must hold at least `frames` samples or is_hand_stable can never pass
    position_history = deque(maxlen=50)
current_target = 0
camera_positions = []
calibration_complete = False
print("Automatic calibration starting.")
print("Please move your hand to the highlighted positions on screen.")
while not calibration_complete:
success, frame = cap.read()
if not success:
continue
frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
results = hands.process(frame_rgb)
target_position = reference_positions[current_target]
screen_target = robot_to_screen_pos(target_position, width, height)
# Draw target
cv2.circle(frame, screen_target, 50, (0, 255, 0), 2)
cv2.circle(frame, screen_target, 10, (0, 255, 0), -1)
cv2.putText(
frame,
f"Target {current_target+1}: Move hand here",
(screen_target[0] - 100, screen_target[1] - 20),
cv2.FONT_HERSHEY_SIMPLEX,
0.7,
(0, 255, 0),
2,
)
hand_position = None
if results.multi_hand_landmarks:
hand_landmarks = results.multi_hand_landmarks[0] # Just use the first hand
mp_drawing.draw_landmarks(frame, hand_landmarks, mp_hands.HAND_CONNECTIONS)
            hand_position = get_hand_pos(hand_landmarks, width, height)
# Draw hand position indicator
hand_x, hand_y = int(hand_position[0]), int(hand_position[1])
cv2.circle(frame, (hand_x, hand_y), 15, (255, 0, 0), -1)
# Check if hand is close to target (in 2D screen space for simplicity)
distance = np.sqrt(
(hand_x - screen_target[0]) ** 2 + (hand_y - screen_target[1]) ** 2
)
if distance < 50: # Threshold in pixels
cv2.putText(
frame,
"Hold position steady...",
(50, 50),
cv2.FONT_HERSHEY_SIMPLEX,
1.0,
(0, 0, 255),
2,
)
if is_hand_stable(hand_position, position_history):
camera_positions.append(hand_position)
print(f"Position {current_target+1} recorded!")
# Move to next target and reset history
current_target += 1
position_history.clear()
if current_target >= len(reference_positions):
calibration_complete = True
else:
print(f"Please move to position {current_target+1}")
cv2.putText(
frame,
f"Calibrating: {current_target}/{len(reference_positions)} positions",
(10, height - 20),
cv2.FONT_HERSHEY_SIMPLEX,
0.7,
(255, 255, 255),
2,
)
cv2.imshow("Calibration", frame)
if cv2.waitKey(1) & 0xFF == ord("q"):
break
cap.release()
cv2.destroyAllWindows()
if not calibration_complete:
print("Calibration aborted.")
return None
robot_points = np.array(reference_positions)
robot_centroid = np.mean(robot_points, axis=0)
robot_centered = robot_points - robot_centroid
camera_points = np.array(camera_positions)
camera_centroid = np.mean(camera_points, axis=0)
camera_centered = camera_points - camera_centroid
    # Find rotation using SVD (Kabsch): R = V @ U.T maps camera -> robot
    H = np.dot(camera_centered.T, robot_centered)
    U, S, Vt = np.linalg.svd(H)
    rotation = np.dot(Vt.T, U.T)
    # Check that it's a proper rotation matrix (det +1, no reflection)
    if np.linalg.det(rotation) < 0:
        Vt[-1, :] *= -1
        rotation = np.dot(Vt.T, U.T)
translation = robot_centroid - np.dot(rotation, camera_centroid)
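    # Why this works (Kabsch rigid alignment): for centered point sets, the
    # rotation minimizing sum ||R @ p_cam_i - p_robot_i||^2 comes from the SVD
    # of the cross-covariance H = C_cam.T @ C_robot, with R = V @ U.T (det
    # forced to +1 to exclude reflections). Requiring the centroids to map
    # onto each other then gives the translation: t = c_robot - R @ c_cam.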
# Create homogeneous transform matrix
transform = np.eye(4)
transform[:3, :3] = rotation
transform[:3, 3] = translation
print("Calibration complete!")
np.save("camera_extrinsics.npy", transform)
return transform
if __name__ == "__main__":
transform = calibrate_camera()
if transform is not None:
print("Transform matrix:")
print(transform)
print("Saved to camera_to_robot_transform.npy")


@@ -0,0 +1,825 @@
import os
import cv2
import torch
import numpy as np
import mediapipe as mp
from collections import deque
from scipy.signal import savgol_filter
from scipy.optimize import least_squares
from intrinsic import calibrate_intrinsics
REFERENCE_OBJECT_SIZE = 0.50  # 50 cm; NOTE: verify -- 50 cm exceeds a typical thumb-index span (~15-20 cm)
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
def robot_to_screen_pos(robot_pos: list, K: np.ndarray) -> tuple:
"""
Project 3D robot position to 2D screen position using intrinsic matrix
"""
    # Project using the intrinsic matrix
    screen_pt = np.dot(K, np.asarray(robot_pos, dtype=float))
# Normalize by z coordinate to get pixel coordinates
screen_x, screen_y = screen_pt[:2] / screen_pt[2]
# Convert to integers
screen_x, screen_y = int(screen_x), int(screen_y)
return screen_x, screen_y
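# A worked instance of the projection above (illustrative numbers, not from a
# real calibration): with K = [[fx, 0, cx], [0, fy, cy], [0, 0, 1]], a point
# [x, y, z] maps to u = fx*x/z + cx and v = fy*y/z + cy. For fx = fy = 600,
# cx = 320, cy = 240, and the point [0.1, 0.0, 0.5]:
#   u = 600 * 0.1 / 0.5 + 320 = 440
#   v = 600 * 0.0 / 0.5 + 240 = 240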
# Filter hand position using Savitzky-Golay to smooth out noise
def filter_hand_position(position: list, history: deque, window_size: int = 7) -> list:
history.append(position)
if len(history) < 5:
return position
history_array = np.array(history)
if len(history) >= window_size:
filtered_x = savgol_filter(history_array[:, 0], window_size, 2)
filtered_y = savgol_filter(history_array[:, 1], window_size, 2)
filtered_z = savgol_filter(history_array[:, 2], window_size, 2)
filtered_position = np.array([filtered_x[-1], filtered_y[-1], filtered_z[-1]])
else:
# Simple average if not enough history
filtered_position = np.mean(history_array, axis=0)
return filtered_position
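# Note: savgol_filter needs a window no longer than the series, which the
# len(history) >= window_size guard above guarantees for window_size = 7.
# Taking only the last element of each filtered series makes this a causal
# smoother: it lags rapid motion slightly, which is acceptable here because
# samples are only recorded once the hand is held still.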
# Return 3D hand position from MediaPipe landmarks using intrinsic matrix
def get_hand_pos(
landmarks: list, intrinsic_matrix: np.ndarray, width: int, height: int
) -> np.ndarray:
"""
Get 3D hand position using camera intrinsics for proper projection
"""
# Use thumb tip and index finger tip
thumb_tip = landmarks.landmark[4] # Thumb tip index
index_tip = landmarks.landmark[8] # Index finger tip index
# Convert landmark coordinates to pixel coordinates
thumb_img = np.array([thumb_tip.x * width, thumb_tip.y * height, 1.0])
index_img = np.array([index_tip.x * width, index_tip.y * height, 1.0])
# Apply inverse of intrinsic matrix to get normalized camera coordinates
K_inv = np.linalg.inv(intrinsic_matrix)
thumb_norm = np.dot(K_inv, thumb_img)
index_norm = np.dot(K_inv, index_img)
# Scale by depth (using MediaPipe's relative depth estimate)
# Note: This scaling is approximate, ideally would use a proper depth sensor
thumb_depth = max(0.1, thumb_tip.z * 100) # Avoid negative/zero depth
index_depth = max(0.1, index_tip.z * 100) # Avoid negative/zero depth
thumb_pos = thumb_norm * thumb_depth
index_pos = index_norm * index_depth
# Midpoint between thumb and index finger
position = (thumb_pos + index_pos) / 2
return position[:3] # Return x, y, z
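# The back-projection above, written out: a pixel (u, v) at depth Z maps to
# the camera-frame point p = Z * K^-1 @ [u, v, 1]^T. MediaPipe's landmark z is
# a relative, unitless estimate, so the "* 100" depth is a heuristic
# placeholder; the scale-calibration step in calibrate_extrinsics below exists
# to correct the resulting magnitudes.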
def measure_pinch_size(
landmarks: list,
intrinsic_matrix: np.ndarray,
frame: np.ndarray,
midas_model,
midas_transform,
width: int,
height: int,
) -> float:
"""
Measure distance between thumb and index finger in 3D space
using intrinsics and MiDaS depth estimation
"""
# Use thumb tip and index finger tip
thumb_tip = landmarks.landmark[4] # Thumb tip
index_tip = landmarks.landmark[8] # Index finger tip
# Convert to pixel coordinates
thumb_pixel = (int(thumb_tip.x * width), int(thumb_tip.y * height))
index_pixel = (int(index_tip.x * width), int(index_tip.y * height))
# Get depth map using MiDaS
img = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
img_input = midas_transform(img).to(DEVICE)
with torch.no_grad():
depth_map = midas_model(img_input)
depth_map = torch.nn.functional.interpolate(
depth_map.unsqueeze(1),
size=(height, width),
mode="bicubic",
align_corners=False,
).squeeze()
depth_map = depth_map.cpu().numpy()
    # Sample depths at finger locations, clamped to frame bounds since
    # landmarks can land slightly outside the image
    ty, tx = np.clip(thumb_pixel[1], 0, height - 1), np.clip(thumb_pixel[0], 0, width - 1)
    iy, ix = np.clip(index_pixel[1], 0, height - 1), np.clip(index_pixel[0], 0, width - 1)
    thumb_depth = depth_map[ty, tx]
    index_depth = depth_map[iy, ix]
# Normalize depths (MiDaS gives relative depths)
# We need to convert to metric scale
# TODO: This scaling factor needs calibration
depth_scale = 5.0
thumb_depth_metric = thumb_depth * depth_scale
index_depth_metric = index_depth * depth_scale
# Apply inverse of intrinsic matrix to get normalized camera coordinates
K_inv = np.linalg.inv(intrinsic_matrix)
# Convert to homogeneous coordinates
thumb_img = np.array([thumb_pixel[0], thumb_pixel[1], 1.0])
index_img = np.array([index_pixel[0], index_pixel[1], 1.0])
# Get normalized camera coordinates
thumb_norm = np.dot(K_inv, thumb_img)
index_norm = np.dot(K_inv, index_img)
# Scale by depth to get 3D coordinates
thumb_pos = thumb_norm * thumb_depth_metric
index_pos = index_norm * index_depth_metric
# Calculate Euclidean distance
distance = np.linalg.norm(thumb_pos - index_pos)
# Visualize depths for debugging
cv2.putText(
frame,
f"Thumb depth: {thumb_depth:.2f}, Index depth: {index_depth:.2f}",
(10, height - 50),
cv2.FONT_HERSHEY_SIMPLEX,
0.6,
(255, 0, 0),
2,
)
return distance
def is_hand_stable(
position, history: deque, n_frames: int = 50, threshold: float = 8.0
) -> bool:
"""Check if hand position is stable using a simple threshold"""
history.append(position)
if len(history) < n_frames:
return False
positions = np.array(history)
max_movement = np.max(np.linalg.norm(positions - positions[-1], axis=1))
return max_movement < threshold
# Compute extrinsics using PnP (Perspective-n-Point)
def compute_extrinsics_pnp(
camera_points: np.ndarray, robot_points: np.ndarray, intrinsic_matrix: np.ndarray
) -> np.ndarray:
"""
Compute camera extrinsics using PnP algorithm.
Args:
camera_points: 3D points in camera frame
robot_points: 3D points in robot frame
intrinsic_matrix: Camera intrinsic matrix
Returns:
        4x4 transformation matrix from the robot (object) frame to the camera
        frame, following the cv2.solvePnP convention
"""
# Convert 3D camera points to 2D image points using intrinsic matrix
image_points = []
for pt in camera_points:
# Project 3D point to 2D using the camera model
pt_homogeneous = np.append(pt, 1.0)
img_pt = np.dot(intrinsic_matrix, pt_homogeneous[:3])
# Normalize to get image coordinates
img_pt = img_pt[:2] / img_pt[2]
image_points.append(img_pt)
image_points = np.array(image_points, dtype=np.float32)
robot_points = np.array(robot_points, dtype=np.float32)
# Use OpenCV's solvePnP to get camera pose relative to robot
success, rvec, tvec = cv2.solvePnP(
robot_points,
image_points,
intrinsic_matrix,
distCoeffs=None, # Add distortion coefficients if available
flags=cv2.SOLVEPNP_ITERATIVE,
    )
    if not success:
        raise RuntimeError("cv2.solvePnP failed to estimate a pose")
    # Convert rotation vector to rotation matrix
rot_matrix, _ = cv2.Rodrigues(rvec)
# Create transformation matrix
transform = np.eye(4)
transform[:3, :3] = rot_matrix
transform[:3, 3] = tvec.flatten()
return transform
def refine_calibration(
camera_points: np.ndarray,
robot_points: np.ndarray,
initial_transform: np.ndarray,
intrinsic_matrix: np.ndarray,
) -> np.ndarray:
"""
Refine calibration using bundle adjustment optimization
Args:
camera_points: 3D points in camera frame
robot_points: 3D points in robot frame
        initial_transform: Initial robot-to-camera transformation (solvePnP convention)
intrinsic_matrix: Camera intrinsic matrix
Returns:
Refined 4x4 transformation matrix
"""
# Extract rotation and translation from initial transform
R = initial_transform[:3, :3]
t = initial_transform[:3, 3]
    # Convert rotation matrix to Rodrigues vector
rvec, _ = cv2.Rodrigues(R)
# Initial parameters (rotation vector and translation)
params = np.concatenate([rvec.flatten(), t])
# Define the error function
def error_function(params):
rvec = params[:3].reshape(3, 1)
tvec = params[3:].reshape(3, 1)
errors = []
for robot_pt, camera_pt in zip(robot_points, camera_points):
robot_pt = robot_pt.reshape(1, 3)
# Project robot point to camera using the current parameters
img_pt, _ = cv2.projectPoints(robot_pt, rvec, tvec, intrinsic_matrix, None)
img_pt = img_pt.flatten()
# Convert camera point to image using intrinsics
camera_pt_h = np.append(camera_pt, 1.0)
proj_camera_pt = np.dot(intrinsic_matrix, camera_pt_h[:3])
proj_camera_pt = proj_camera_pt[:2] / proj_camera_pt[2]
# Calculate reprojection error
error = img_pt - proj_camera_pt
errors.extend(error)
return np.array(errors)
# Run optimization
result = least_squares(error_function, params, method="lm")
# Convert optimized parameters back to transformation matrix
rvec_opt = result.x[:3].reshape(3, 1)
tvec_opt = result.x[3:].reshape(3, 1)
R_opt, _ = cv2.Rodrigues(rvec_opt)
transform_opt = np.eye(4)
transform_opt[:3, :3] = R_opt
transform_opt[:3, 3] = tvec_opt.flatten()
return transform_opt
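# Note on the optimization above: method="lm" is Levenberg-Marquardt, which
# requires at least as many residuals as parameters. Each point pair
# contributes two residuals (u and v), so the 6 pose parameters need at least
# 3 point pairs; the 9 reference positions used below give 18 residuals.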
def calc_reprojection_error(
robot_points: list,
camera_points: list,
transform: np.ndarray,
intrinsic_matrix: np.ndarray,
) -> tuple:
"""
Calculate reprojection error using the intrinsic matrix for proper projection
"""
# Extract rotation and translation
R = transform[:3, :3]
t = transform[:3, 3]
# Convert to rotation vector
rvec, _ = cv2.Rodrigues(R)
tvec = t.reshape(3, 1)
# Project robot points to image space
robot_points = np.array(robot_points, dtype=np.float32)
projected_pts, _ = cv2.projectPoints(
robot_points, rvec, tvec, intrinsic_matrix, None
)
projected_pts = projected_pts.reshape(-1, 2)
# Convert camera points to image space
camera_img_pts = []
for pt in camera_points:
pt_h = np.append(pt, 1.0)
img_pt = np.dot(intrinsic_matrix, pt_h[:3])
img_pt = img_pt[:2] / img_pt[2]
camera_img_pts.append(img_pt)
camera_img_pts = np.array(camera_img_pts)
# Calculate errors in image space (pixel distances)
errors = np.linalg.norm(projected_pts - camera_img_pts, axis=1)
return errors, np.mean(errors)
def get_intrinsics() -> np.ndarray:
file_dir = os.path.dirname(os.path.abspath(__file__))
files = os.listdir(file_dir)
filename = "intrinsics.npz"
if filename in files:
try:
calibration_data = np.load(os.path.join(file_dir, filename))
intrinsics = calibration_data["K"]
print("Intrinsic matrix loaded.")
print(intrinsics)
return intrinsics
except (KeyError, FileNotFoundError) as e:
print(f"Error loading intrinsic matrix: {e}")
print("Intrinsic matrix not found or invalid.")
print("Calibrating intrinsic matrix...")
intrinsics = calibrate_intrinsics()
print("Intrinsic matrix calibrated.")
return intrinsics
def calibrate_extrinsics() -> np.ndarray:
# Load intrinsic matrix
intrinsics = get_intrinsics()
if intrinsics is None:
print(
"Failed to load or calibrate intrinsic matrix. Cannot proceed with extrinsic calibration."
)
return None
mp_hands = mp.solutions.hands
mp_drawing = mp.solutions.drawing_utils
hands = mp_hands.Hands(
model_complexity=1,
max_num_hands=1,
static_image_mode=False,
)
midas = torch.hub.load("intel-isl/MiDaS", "MiDaS_small")
midas.to(DEVICE)
midas.eval()
transform = torch.hub.load("intel-isl/MiDaS", "transforms")
transform = transform.small_transform
# All in robot coordinates
reference_positions = [
[0.3, 0.0, 0.4], # Center
[0.3, 0.2, 0.4], # Right
[0.3, -0.2, 0.4], # Left
[0.3, 0.0, 0.6], # Higher
[0.3, 0.0, 0.2], # Lower
[0.2, 0.2, 0.3], # Front right bottom
[0.2, -0.2, 0.3], # Front left bottom
[0.2, 0.2, 0.5], # Front right top
[0.2, -0.2, 0.5], # Front left top
]
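    # Units are meters in the robot base frame. Spreading targets across x, y,
    # and z (rather than a single plane) keeps the PnP problem
    # well-conditioned; near-coplanar points can admit ambiguous poses.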
# Validation positions for reprojection error
validation_positions = [[0.25, 0.1, 0.35], [0.35, -0.1, 0.45], [0.3, 0.15, 0.5]]
cap = cv2.VideoCapture(0)
success, frame = cap.read()
assert success, "Failed to access camera"
width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
n_frames = 50
position_history = deque(maxlen=2 * n_frames)
print("Step 1: Scale calibration")
print(
f"Hold your thumb and index finger exactly {REFERENCE_OBJECT_SIZE*100}cm apart"
)
scale_factor = None
scale_calibration_complete = False
reference_distances = []
while not scale_calibration_complete:
success, frame = cap.read()
if not success:
continue
frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
results = hands.process(frame_rgb)
cv2.putText(
frame,
f"Hold thumb and index finger exactly {REFERENCE_OBJECT_SIZE*100}cm apart",
(10, 30),
cv2.FONT_HERSHEY_SIMPLEX,
0.7,
(0, 255, 0),
2,
)
cv2.putText(
frame,
"Hold position steady...",
(10, 70),
cv2.FONT_HERSHEY_SIMPLEX,
0.7,
(0, 255, 0),
2,
)
if results.multi_hand_landmarks:
hand_landmarks = results.multi_hand_landmarks[0]
mp_drawing.draw_landmarks(frame, hand_landmarks, mp_hands.HAND_CONNECTIONS)
            distance = measure_pinch_size(
                hand_landmarks, intrinsics, frame, midas, transform, width, height
            )
cv2.putText(
frame,
f"Current distance: {distance:.2f}",
(10, 110),
cv2.FONT_HERSHEY_SIMPLEX,
0.7,
(0, 255, 0),
2,
)
hand_position = get_hand_pos(hand_landmarks, intrinsics, width, height)
if is_hand_stable(hand_position, position_history, n_frames=30):
reference_distances.append(distance)
cv2.putText(
frame,
"Measurement taken!",
(10, 150),
cv2.FONT_HERSHEY_SIMPLEX,
0.7,
(0, 0, 255),
2,
)
# Take 5 measurements for robustness
if len(reference_distances) >= 5:
# Calculate scale factor (median to avoid outliers)
median_distance = np.median(reference_distances)
scale_factor = REFERENCE_OBJECT_SIZE / median_distance
print(f"Scale factor: {scale_factor}")
scale_calibration_complete = True
position_history.clear()
else:
# Wait a bit between measurements
for i in range(30):
cv2.imshow("Calibration", frame)
cv2.waitKey(100)
position_history.clear()
cv2.putText(
frame,
f"Measurements: {len(reference_distances)}/5",
(10, height - 20),
cv2.FONT_HERSHEY_SIMPLEX,
0.7,
(255, 255, 255),
2,
)
cv2.imshow("Calibration", frame)
if cv2.waitKey(1) & 0xFF == ord("q"):
break
print("Step 2: Position calibration")
print("Please move your hand to the highlighted positions on screen.")
current_target = 0
camera_positions = []
filtering_history = deque(maxlen=n_frames)
calibration_complete = False
while not calibration_complete:
success, frame = cap.read()
if not success:
continue
frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
results = hands.process(frame_rgb)
target_position = reference_positions[current_target]
# Use intrinsic matrix for accurate projection
screen_target = robot_to_screen_pos(target_position, intrinsics)
# Draw target
cv2.circle(frame, screen_target, 50, (0, 255, 0), 2)
cv2.circle(frame, screen_target, 10, (0, 255, 0), -1)
cv2.putText(
frame,
f"Target {current_target+1}: Move hand here",
(screen_target[0] - 100, screen_target[1] - 20),
cv2.FONT_HERSHEY_SIMPLEX,
0.7,
(0, 255, 0),
2,
)
hand_position = None
if results.multi_hand_landmarks:
hand_landmarks = results.multi_hand_landmarks[0]
mp_drawing.draw_landmarks(frame, hand_landmarks, mp_hands.HAND_CONNECTIONS)
raw_hand_position = get_hand_pos(hand_landmarks, intrinsics, width, height)
hand_position = filter_hand_position(raw_hand_position, filtering_history)
hand_position_scaled = hand_position * scale_factor
# Project hand position to screen
hand_pt_h = np.append(hand_position, 1.0)
screen_pt = np.dot(intrinsics, hand_pt_h[:3])
screen_pt = (
int(screen_pt[0] / screen_pt[2]),
int(screen_pt[1] / screen_pt[2]),
)
# Draw hand position indicator
cv2.circle(frame, screen_pt, 15, (255, 0, 0), -1)
# Check if hand is close to target (in 2D screen space for simplicity)
distance = np.sqrt(
(screen_pt[0] - screen_target[0]) ** 2
+ (screen_pt[1] - screen_target[1]) ** 2
)
if distance < 50: # Threshold in pixels
cv2.putText(
frame,
"Hold position steady...",
(50, 50),
cv2.FONT_HERSHEY_SIMPLEX,
1.0,
(0, 0, 255),
2,
)
if is_hand_stable(hand_position, position_history, n_frames):
camera_positions.append(hand_position_scaled)
print(f"Position {current_target+1} recorded!")
# Move to next target and reset history
current_target += 1
                    position_history.clear()
                    filtering_history.clear()
if current_target >= len(reference_positions):
calibration_complete = True
else:
print(f"Please move to position {current_target+1}")
cv2.putText(
frame,
f"Calibrating: {current_target}/{len(reference_positions)} positions",
(10, height - 20),
cv2.FONT_HERSHEY_SIMPLEX,
0.7,
(255, 255, 255),
2,
)
cv2.imshow("Calibration", frame)
if cv2.waitKey(1) & 0xFF == ord("q"):
break
if not calibration_complete:
print("Calibration aborted.")
cap.release()
cv2.destroyAllWindows()
return None
# Use PnP for initial extrinsic estimation
robot_points = np.array(reference_positions)
camera_points = np.array(camera_positions)
# Compute initial transform using PnP
initial_transform = compute_extrinsics_pnp(camera_points, robot_points, intrinsics)
print("Initial transform computed using PnP:")
print(initial_transform)
# Refine using bundle adjustment
refined_transform = refine_calibration(
camera_points, robot_points, initial_transform, intrinsics
)
print("Refined transform after bundle adjustment:")
print(refined_transform)
print("Calibration complete! Now validating...")
validation_camera_positions = []
current_target = 0
position_history.clear()
while current_target < len(validation_positions):
success, frame = cap.read()
if not success:
continue
frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
results = hands.process(frame_rgb)
target_position = validation_positions[current_target]
screen_target = robot_to_screen_pos(target_position, intrinsics)
# Draw validation target
cv2.circle(frame, screen_target, 50, (255, 165, 0), 2) # Orange
cv2.circle(frame, screen_target, 10, (255, 165, 0), -1)
cv2.putText(
frame,
f"Validation {current_target+1}: Move hand here",
(screen_target[0] - 100, screen_target[1] - 20),
cv2.FONT_HERSHEY_SIMPLEX,
0.7,
(255, 165, 0),
2,
)
if results.multi_hand_landmarks:
hand_landmarks = results.multi_hand_landmarks[0]
mp_drawing.draw_landmarks(frame, hand_landmarks, mp_hands.HAND_CONNECTIONS)
raw_hand_position = get_hand_pos(hand_landmarks, intrinsics, width, height)
hand_position = filter_hand_position(raw_hand_position, filtering_history)
hand_position_scaled = hand_position * scale_factor
# Project hand position to screen for visualization
hand_pt_h = np.append(hand_position, 1.0)
screen_pt = np.dot(intrinsics, hand_pt_h[:3])
screen_pt = (
int(screen_pt[0] / screen_pt[2]),
int(screen_pt[1] / screen_pt[2]),
)
# Draw hand position indicator
cv2.circle(frame, screen_pt, 15, (255, 0, 0), -1)
# Check if hand is close to target
distance = np.sqrt(
(screen_pt[0] - screen_target[0]) ** 2
+ (screen_pt[1] - screen_target[1]) ** 2
)
if distance < 50:
cv2.putText(
frame,
"Hold position steady for validation...",
(50, 50),
cv2.FONT_HERSHEY_SIMPLEX,
0.8,
(255, 165, 0),
2,
)
if is_hand_stable(hand_position, position_history, n_frames=30):
validation_camera_positions.append(hand_position_scaled)
print(f"Validation position {current_target+1} recorded!")
current_target += 1
                    position_history.clear()
                    filtering_history.clear()
cv2.putText(
frame,
f"Validation: {current_target}/{len(validation_positions)} positions",
(10, height - 20),
cv2.FONT_HERSHEY_SIMPLEX,
0.7,
(255, 255, 255),
2,
)
cv2.imshow("Calibration", frame)
if cv2.waitKey(1) & 0xFF == ord("q"):
break
    # Use the refined transform for validation. Guard against a run aborted
    # with "q", which would leave fewer recorded points than targets.
    if len(validation_camera_positions) != len(validation_positions):
        print("Validation incomplete; skipping reprojection-error report.")
        cap.release()
        cv2.destroyAllWindows()
        return refined_transform
    errors, mean_error = calc_reprojection_error(
        validation_positions, validation_camera_positions, refined_transform, intrinsics
    )
print("Validation complete!")
print(f"Mean reprojection error: {mean_error:.4f} pixels")
print(f"Max reprojection error: {np.max(errors):.4f} pixels")
print(f"Min reprojection error: {np.min(errors):.4f} pixels")
calibration_data = {
"transform": refined_transform,
"scale_factor": scale_factor,
"reprojection_error": mean_error,
"intrinsic_matrix": intrinsics,
"calibration_date": np.datetime64("now"),
}
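    # np.save pickles this dict (as a 0-d object array); load it back with
    # np.load("extrinsics.npy", allow_pickle=True).item()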
np.save("extrinsics.npy", calibration_data)
# Show validation results visually
results_img = np.zeros((300, 600, 3), dtype=np.uint8)
cv2.putText(
results_img,
"Calibration Results",
(150, 40),
cv2.FONT_HERSHEY_SIMPLEX,
1.0,
(255, 255, 255),
2,
)
cv2.putText(
results_img,
f"Scale Factor: {scale_factor:.6f}",
(30, 100),
cv2.FONT_HERSHEY_SIMPLEX,
0.7,
(0, 255, 0),
2,
)
cv2.putText(
results_img,
f"Mean Reprojection Error: {mean_error:.4f} pixels",
(30, 140),
cv2.FONT_HERSHEY_SIMPLEX,
0.7,
(0, 255, 0)
if mean_error < 10
else (0, 165, 255)
if mean_error < 20
else (0, 0, 255),
2,
)
cv2.putText(
results_img,
f"Max Reprojection Error: {np.max(errors):.4f} pixels",
(30, 180),
cv2.FONT_HERSHEY_SIMPLEX,
0.7,
(0, 255, 0)
if np.max(errors) < 15
else (0, 165, 255)
if np.max(errors) < 30
else (0, 0, 255),
2,
)
quality = (
"Excellent"
if mean_error < 5
else "Good"
if mean_error < 10
else "Acceptable"
if mean_error < 20
else "Poor"
)
cv2.putText(
results_img,
f"Calibration Quality: {quality}",
(30, 240),
cv2.FONT_HERSHEY_SIMPLEX,
0.9,
(0, 255, 0)
if quality in ["Excellent", "Good"]
else (0, 165, 255)
if quality == "Acceptable"
else (0, 0, 255),
2,
)
cv2.imshow("Calibration Results", results_img)
cv2.waitKey(0)
cap.release()
cv2.destroyAllWindows()
return refined_transform
if __name__ == "__main__":
transform = calibrate_extrinsics()
if transform is not None:
print("Calibration successful!")
print(transform)
else:
print("Calibration failed.")