Mirror of https://github.com/luckyrobots/open_phantom.git, synced 2025-04-03 10:32:19 +00:00

Commit: improve icp and robot params
Parent: aae4e91f54
This commit: b38527b5db

@@ -1,92 +1,94 @@
import os
import time

import cv2
import depth_pro
import mediapipe as mp
import numpy as np
import open3d as o3d
import torch
from PIL import Image
from robot_manager import RobotManager
from sam2.sam2_image_predictor import SAM2ImagePredictor
from tqdm import tqdm
from utils.visualizations import *

# from diffusers import StableDiffusionInpaintPipeline


class HandProcessor:
    def __init__(self) -> None:
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        self.mp_hands = mp.solutions.hands
        self.mp_drawing = mp.solutions.drawing_utils
        self.mp_drawing_styles = mp.solutions.drawing_styles

        self.hands = self.mp_hands.Hands(
            model_complexity=1,
            max_num_hands=1,
            static_image_mode=False,
        )

        # NOTE: Look into better depth estimation models
        # Initialize MiDaS for depth estimation
        print("Loading MiDaS model...")
        self.midas = torch.hub.load("intel-isl/MiDaS", "DPT_Hybrid")
        self.midas.to(self.device)
        self.midas.eval()

        # Load MiDaS transforms to resize and normalize input images
        self.midas_transforms = torch.hub.load("intel-isl/MiDaS", "transforms")
        self.transform = self.midas_transforms.dpt_transform

        # Load SAM2 model for hand segmentation
        print("Loading SAM2 model...")
        self.sam2_predictor = SAM2ImagePredictor.from_pretrained(
            "facebook/sam2-hiera-large"
        )

    # Segment hand from image using MediaPipe Hands landmarks
    def _create_mask(self, image: np.ndarray, landmarks: list, thickness: int = 5, padding: int = 10) -> np.ndarray:
        height, width = image.shape[:2]
        mask = np.zeros((height, width), dtype=np.uint8)

        points = [(int(landmark.x * width), int(landmark.y * height)) for landmark in landmarks.landmark]

        # Draw filled circles at each landmark
        for point in points:
            cv2.circle(mask, point, thickness, 255, -1)

        # Connect all landmarks with thick lines
        for connection in self.mp_hands.HAND_CONNECTIONS:
            start_idx, end_idx = connection
            cv2.line(mask, points[start_idx], points[end_idx], 255, thickness)

        # Create palm by connecting base of fingers with wrist
        palm_points = [points[0], points[1], points[5], points[9], points[13], points[17]]
        cv2.fillPoly(mask, [np.array(palm_points)], 255)

        # Create shape between fingers
        finger_bases = [(1, 5), (5, 9), (9, 13), (13, 17)]
        for base1, base2 in finger_bases:
            triangle = np.array([points[0], points[base1], points[base2]])
            cv2.fillPoly(mask, [triangle], 255)

        # Dilate to smooth and expand slightly
        kernel = np.ones((padding, padding), np.uint8)
        mask = cv2.dilate(mask, kernel, iterations=1)

        # Smooth the mask
        mask = cv2.GaussianBlur(mask, (21, 21), 0)
        _, mask = cv2.threshold(mask, 50, 255, cv2.THRESH_BINARY)

        return mask

    # Segment hand using SAM2 with the MediaPipe landmarks as point prompts
    def _create_mask(self, frame: np.ndarray, landmarks: list) -> np.ndarray:
        height, width = frame.shape[:2]
        frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

        # Set image in SAM2 predictor
        with torch.inference_mode(), torch.autocast("cuda", dtype=torch.bfloat16):
            self.sam2_predictor.set_image(frame_rgb)  # Set image for prediction

            # Convert landmarks to point prompts
            points = []
            for landmark in landmarks.landmark:
                x, y = int(landmark.x * width), int(landmark.y * height)
                points.append([x, y])

            input_points = np.array(points)

            # Predict mask with point prompts
            masks, _, _ = self.sam2_predictor.predict(
                point_coords=input_points,  # Pass the points as prompts
                point_labels=np.ones(len(input_points)),  # All points from hand are foreground
                multimask_output=False,  # Just get one mask
            )

            mask = masks[0].astype(np.uint8) * 255

        return mask
"""
|
||||
Transform input image to match MiDaS model requirements
|
||||
Estimate depth map using MiDaS model
|
||||
"""
|
||||
# TODO: Look into why screen goes black sometimes
|
||||
|
||||
# TODO: Swap MiDaS for ml-depth-pro model
|
||||
def _estimate_depth(self, image: np.ndarray) -> tuple:
|
||||
img = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
|
||||
|
||||
|
||||
# Transform img for MiDaS model
|
||||
input_batch = self.transform(img).to(self.device)
|
||||
|
||||
|
||||
with torch.inference_mode():
|
||||
prediction = self.midas(input_batch)
|
||||
|
||||
|
||||
# Interpolate to original size
|
||||
prediction = torch.nn.functional.interpolate(
|
||||
prediction.unsqueeze(1),
|
||||
@ -94,127 +96,146 @@ class HandProcessor:
|
||||
mode="bicubic",
|
||||
align_corners=False,
|
||||
).squeeze()
|
||||
|
||||
|
||||
# Convert to numpy and normalize to 0-255 for visualization
|
||||
depth_map = prediction.cpu().numpy()
|
||||
depth_min = depth_map.min()
|
||||
depth_max = depth_map.max()
|
||||
depth_map_normalized = 255 * (depth_map - depth_min) / (depth_max - depth_min)
|
||||
|
||||
|
||||
return depth_map, depth_map_normalized.astype(np.uint8)
|
||||
|
||||
|
||||
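    # NOTE: MiDaS predicts relative depth (unitless, only defined up to an unknown scale
    # and shift), so the raw depth_map values are not metric; downstream code rescales
    # them heuristically.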
    # TODO: Look into better depth scaling
    def _create_cloud(self, depth_map: np.ndarray, hand_mask: np.ndarray) -> np.ndarray:
        focal_x, focal_y, center_x, center_y = self.camera_intrinsics

        v_coords, u_coords = np.where(hand_mask > 0)
        z_values = depth_map[v_coords, u_coords]

        # Filter out zero depth values
        valid_indices = z_values > 0
        u_coords = u_coords[valid_indices]
        v_coords = v_coords[valid_indices]
        z_values = z_values[valid_indices]

        # NOTE: Arbitrary scaling factor for depth
        z_metric = z_values * 0.5

        # Back-project to 3D using camera intrinsics
        x_values = (u_coords - center_x) * z_metric / focal_x
        y_values = (v_coords - center_y) * z_metric / focal_y

        points = np.column_stack((x_values, y_values, z_metric))

        return points
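    # The back-projection above follows the pinhole camera model:
    #   x = (u - cx) * z / fx,  y = (v - cy) * z / fy
    # so the quality of the cloud depends directly on the assumed intrinsics.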
    # TODO: Look into better Z scaling
    def _create_mesh(self, landmarks: list, image_dims: tuple) -> np.ndarray:
        width, height = image_dims

        vertices = []
        for landmark in landmarks.landmark:
            vertices.append(
                [landmark.x * width, landmark.y * height, landmark.z * width]
            )

        # Define faces (triangles) connecting landmarks
        faces = [
            # Palm
            [0, 1, 5],
            [0, 5, 9],
            [0, 9, 13],
            [0, 13, 17],
            # Thumb
            [1, 2, 3],
            [2, 3, 4],
            # Index finger
            [5, 6, 7],
            [6, 7, 8],
            # Middle finger
            [9, 10, 11],
            [10, 11, 12],
            # Ring finger
            [13, 14, 15],
            [14, 15, 16],
            # Pinky
            [17, 18, 19],
            [18, 19, 20],
        ]

        dense_vertices = list(vertices)

        # Add interpolated vertices along finger segments
        connections = self.mp_hands.HAND_CONNECTIONS
        for connection in connections:
            start_idx, end_idx = connection
            start_point = np.array(vertices[start_idx])
            end_point = np.array(vertices[end_idx])

            # Add 2 interpolated points between each connected pair
            for t in [0.33, 0.66]:
                interp_point = start_point * (1 - t) + end_point * t
                dense_vertices.append(interp_point.tolist())

        return np.array(dense_vertices), np.array(faces)
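    # MediaPipe reports landmark.z on roughly the same normalized scale as landmark.x
    # (relative to the wrist), which is why z is multiplied by the image width above.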
    # TODO: Improve ICP registration for better alignment
    def _icp_registration(self, cloud: np.ndarray, vertices: np.ndarray) -> np.ndarray:
        source = o3d.geometry.PointCloud()
        source.points = o3d.utility.Vector3dVector(vertices)

        target = o3d.geometry.PointCloud()
        target.points = o3d.utility.Vector3dVector(cloud)

        # Calculate centroids
        source_centroid = np.mean(vertices, axis=0)
        target_centroid = np.mean(cloud, axis=0)

        # Calculate initial translation to align centroids
        initial_translation = target_centroid - source_centroid

        # Create initial transformation matrix
        trans_init = np.eye(4)
        trans_init[:3, 3] = initial_translation

        result = o3d.pipelines.registration.registration_icp(
            source,
            target,
            max_correspondence_distance=0.1,  # Increased distance
            init=trans_init,  # Initial transformation
            estimation_method=o3d.pipelines.registration.TransformationEstimationPointToPoint(),
            criteria=o3d.pipelines.registration.ICPConvergenceCriteria(
                max_iteration=100
            ),  # Increased iterations
        )

        transformation = result.transformation

        return transformation
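    # ICP only converges to a local minimum, so the centroid-alignment initialization above
    # matters; max_correspondence_distance bounds which source/target point pairs count as
    # matches during each iteration.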
    # TODO: Look into better depth scaling
    def _refine_landmarks(self, landmarks: list, transform: np.ndarray, image_dims: tuple):
        width, height = image_dims

        refined_landmarks = []
        for landmark in landmarks.landmark:
            # TODO: Figure out better scaling factor for landmark.z
            point = np.array(
                [landmark.x * width, landmark.y * height, landmark.z * width, 1.0]
            )

            # Apply transformation to 3D point
            transformed = np.dot(transform, point)
            refined_landmarks.append(transformed[:3])

        return refined_landmarks

    def _apply_constraints(self, landmarks: list):
        constrained = np.array(landmarks)

        # Define finger joint indices
        # MediaPipe hand model: Wrist is 0, thumb is 1-4, index is 5-8, etc.
        thumb_indices = [1, 2, 3, 4]
        index_indices = [5, 6, 7, 8]

        # Constrain the last two joints of thumb and index finger
        # as mentioned in the paper
        for finger_indices in [thumb_indices, index_indices]:
@@ -224,115 +245,165 @@ class HandProcessor:
            joint1 = constrained[finger_indices[-3]]
            joint2 = constrained[finger_indices[-2]]
            joint3 = constrained[finger_indices[-1]]

            # Direction of the first segment
            dir1 = joint2 - joint1
            dir1 = dir1 / np.linalg.norm(dir1)

            # Instead of full ball joint, constrain the last joint's direction
            # to follow similar direction as the previous segment
            ideal_dir = dir1.copy()
            actual_dir = joint3 - joint2
            actual_length = np.linalg.norm(actual_dir)

            # Force the direction to be within a cone of the previous segment
            # (limiting to single degree of freedom approximately)
            corrected_dir = ideal_dir * actual_length

            # Apply the correction
            constrained[finger_indices[-1]] = joint2 + corrected_dir

        return constrained
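    # The constraint above preserves each distal segment's length but snaps its direction
    # to the preceding segment, approximating a single-degree-of-freedom joint rather than
    # a full ball joint.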
    def _get_robot_params(self, refined_landmarks) -> tuple:
        # Extract keypoints
        landmarks = np.array(refined_landmarks)

        # Define indices for specific parts of the hand
        thumb_indices = [1, 2, 3, 4]  # Thumb landmarks
        index_indices = [5, 6, 7, 8]  # Index finger landmarks
        thumb_tip_idx = 4
        index_tip_idx = 8

        # 1. Set target position as midpoint between thumb tip and index tip
        thumb_tip = landmarks[thumb_tip_idx]
        index_tip = landmarks[index_tip_idx]
        position = (thumb_tip + index_tip) / 2

        # 2. Calculate orientation
        # Get all thumb and index finger points for plane fitting
        thumb_points = landmarks[thumb_indices]
        index_points = landmarks[index_indices]
        finger_points = np.vstack([thumb_points, index_points])

        # Fit plane to finger points
        centroid = np.mean(finger_points, axis=0)
        centered_points = finger_points - centroid

        # Use SVD to find the normal to the best-fitting plane
        u, s, vh = np.linalg.svd(centered_points)
        # The normal is the last right singular vector
        normal = vh[2, :]

        # Ensure normal is a unit vector
        normal = normal / np.linalg.norm(normal)

        # Fit a principal axis through thumb points
        thumb_centroid = np.mean(thumb_points, axis=0)
        thumb_centered = thumb_points - thumb_centroid

        # Use SVD again to find direction of maximum variance (principal axis)
        u, s, vh = np.linalg.svd(thumb_centered)
        principal_axis = vh[0, :]  # First singular vector

        # Ensure principal axis is a unit vector
        principal_axis = principal_axis / np.linalg.norm(principal_axis)

        # Construct orthogonal vectors for orientation matrix
        z_axis = normal  # Normal to the plane
        x_axis = principal_axis  # Along the thumb

        # Ensure x_axis is orthogonal to z_axis
        x_axis = x_axis - np.dot(x_axis, z_axis) * z_axis
        x_axis = x_axis / np.linalg.norm(x_axis)

        # Compute y_axis as cross product to ensure orthogonality
        y_axis = np.cross(z_axis, x_axis)

        # Create rotation matrix
        rotation_matrix = np.column_stack([x_axis, y_axis, z_axis])

        # 3. Calculate gripper width
        gripper_width = np.linalg.norm(thumb_tip - index_tip)

        # Apply percentile threshold to mitigate slippage (as mentioned in the paper)
        # Store gripper widths in a buffer to compute percentiles
        if not hasattr(self, "gripper_width_buffer"):
            self.gripper_width_buffer = []

        self.gripper_width_buffer.append(gripper_width)
        # Keep only the last 100 values
        if len(self.gripper_width_buffer) > 100:
            self.gripper_width_buffer.pop(0)

        # Apply 20th percentile threshold
        if len(self.gripper_width_buffer) > 5:  # Need enough samples
            min_width = np.percentile(self.gripper_width_buffer, 20)
            if gripper_width < min_width:
                gripper_width = min_width

        return position, rotation_matrix, gripper_width
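    # For the centered point sets above, the rows of vh from np.linalg.svd are the principal
    # directions: vh[0] is the direction of largest variance (used as the thumb axis) and
    # vh[2], the one with the smallest singular value, is the best-fit plane normal.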
    def record_video(self) -> str:
        output_dir = os.path.join(os.getcwd(), "recordings")
        os.makedirs(output_dir, exist_ok=True)

        timestamp = time.strftime("%Y%m%d-%H%M%S")
        video_filename = os.path.join(output_dir, f"recorded_video_{timestamp}.mp4")

        cap = cv2.VideoCapture(0)

        assert cap.isOpened(), "Failed to open camera."

        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
        fps = int(cap.get(cv2.CAP_PROP_FPS))

        # Update camera intrinsics based on video dimensions
        self.camera_intrinsics = (width * 0.8, height * 0.8, width / 2, height / 2)

        fourcc = cv2.VideoWriter_fourcc(*"mp4v")
        out = None

        recording = False
        did_record = False

        print("Camera is active. Press 'r' to start/stop recording, 'q' to quit.")

        while cap.isOpened():
            success, frame = cap.read()
            assert success, "Failed to read from camera."

            # Mirror the image for more intuitive viewing
            frame = cv2.flip(frame, 1)

            # Create a separate display frame for showing the recording indicator
            display_frame = frame.copy()

            # Display recording indicator ONLY on the display frame
            if recording:
                cv2.circle(display_frame, (30, 30), 15, (0, 0, 255), -1)
                cv2.putText(display_frame, "RECORDING", (50, 40),
                            cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2, cv2.LINE_AA)

            # Show the display frame (with indicator if recording)
            cv2.imshow("Video Recording", display_frame)

            # Write the original frame (without indicator) to video file if recording
            if recording and out is not None:
                out.write(frame)

            key = cv2.waitKey(1) & 0xFF

            # NOTE: Haven't tested what happens if user records multiple videos in one session (probably overwrites?)
            if key == ord("q"):
                break
            elif key == ord("r"):
                recording = not recording
                if recording:
                    print(f"Started recording to {video_filename}")
@@ -342,29 +413,30 @@ class HandProcessor:
                    out.release()
                    print(f"Stopped recording. Video saved to {video_filename}")
                    did_record = True
                    break

        if out is not None:
            out.release()
        cap.release()
        cv2.destroyAllWindows()

        return video_filename if did_record else None
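    # The intrinsics above are a rough placeholder (fx ~ 0.8 * width, fy ~ 0.8 * height,
    # principal point at the image center); utils/calibrations.py is the planned home for
    # real calibration.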
    def process_video(self, video_path: str) -> tuple:
        assert video_path, "Video path is required."

        cap = cv2.VideoCapture(video_path)

        assert cap.isOpened(), f"Failed to open video file {video_path}"

        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
        fps = int(cap.get(cv2.CAP_PROP_FPS))
        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

        # Update camera intrinsics based on video dimensions
        self.camera_intrinsics = (width * 0.8, height * 0.8, width / 2, height / 2)

        base_path = os.path.splitext(video_path)[0]
        segmented_output_path = f"{base_path}_masked.mp4"
        depth_output_path = f"{base_path}_depth.mp4"
@@ -372,94 +444,112 @@ class HandProcessor:
        registration_output_path = f"{base_path}_registration.mp4"
        constraints_output_path = f"{base_path}_constraints.mp4"
        robot_output_path = f"{base_path}_robot.mp4"

        fourcc = cv2.VideoWriter_fourcc(*"mp4v")
        segmented_out = cv2.VideoWriter(segmented_output_path, fourcc, fps, (width, height))
        depth_out = cv2.VideoWriter(depth_output_path, fourcc, fps, (width, height))
        mesh_out = cv2.VideoWriter(mesh_output_path, fourcc, fps, (width, height))
        reg_out = cv2.VideoWriter(registration_output_path, fourcc, fps, (640, 480))  # Fixed size
        constraints_out = cv2.VideoWriter(constraints_output_path, fourcc, fps, (width, height))
        robot_out = cv2.VideoWriter(robot_output_path, fourcc, fps, (width, height))

        print(f"Processing video with {total_frames} frames...")
        for _ in tqdm(range(total_frames)):
            success, frame = cap.read()
            assert success, "Failed to read frame from video."

            # Convert image to RGB for MediaPipe
            image_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

            # Process with MediaPipe Hands
            results = self.hands.process(image_rgb)

            # Initialize output frames
            segmented_frame = frame.copy()
            depth_frame = np.zeros((height, width, 3), dtype=np.uint8)
            mesh_frame = frame.copy()
            reg_frame = np.ones((480, 640, 3), dtype=np.uint8) * 255  # Fixed size, white background
            constraints_frame = frame.copy()
            robot_frame = frame.copy()

            # Process if hand is detected
            if results.multi_hand_landmarks:
                for hand_landmarks in results.multi_hand_landmarks:
                    # Segment hand
                    hand_mask = self._create_mask(frame, hand_landmarks)
                    mask_overlay = frame.copy()
                    mask_overlay[hand_mask > 0] = [0, 0, 255]  # Red color for mask
                    segmented_frame = cv2.addWeighted(frame, 0.7, mask_overlay, 0.3, 0)

                    # Depth estimation
                    depth_map, depth_vis = self._estimate_depth(frame)
                    depth_colored = cv2.applyColorMap(depth_vis, cv2.COLORMAP_JET)
                    depth_frame = depth_colored.copy()

                    # Create hand mesh
                    hand_vertices, hand_faces = self._create_mesh(hand_landmarks, (width, height))
                    mesh_frame = visualize_mesh(frame, hand_vertices, hand_faces)

                    # Create point cloud from depth
                    cloud = self._create_cloud(depth_map, hand_mask)

                    # Perform ICP registration
                    icp_transform = self._icp_registration(cloud, hand_vertices)
                    reg_frame = visualize_registration(cloud, hand_vertices, icp_transform)

                    # Refine landmarks using ICP transformation
                    refined_landmarks = self._refine_landmarks(hand_landmarks, icp_transform, (width, height))

                    # Store pre-constraint landmarks for visualization
                    original_refined = refined_landmarks.copy()

                    # Apply anatomical constraints
                    constrained_landmarks = self._apply_constraints(refined_landmarks)
                    constraints_frame = visualize_constraints(
                        frame,
                        original_refined,
                        constrained_landmarks,
                        self.camera_intrinsics,
                    )

                    # Calculate robot parameters
                    position, orientation, gripper_width = self._get_robot_params(
                        constrained_landmarks
                    )
                    robot_frame = visualize_robot_params(
                        frame,
                        position,
                        orientation,
                        gripper_width,
                        self.camera_intrinsics,
                    )

            segmented_out.write(segmented_frame)
            depth_out.write(depth_frame)
            mesh_out.write(mesh_frame)
            reg_out.write(reg_frame)
            constraints_out.write(constraints_frame)
            robot_out.write(robot_frame)

            display_scale = 0.5
            display_size = (int(width * display_scale), int(height * display_scale))
            reg_display_size = (int(640 * display_scale), int(480 * display_scale))

            cv2.imshow("Segmented", cv2.resize(segmented_frame, display_size))
            cv2.imshow("Depth", cv2.resize(depth_frame, display_size))
            cv2.imshow("Mesh", cv2.resize(mesh_frame, display_size))
            cv2.imshow("Registration", cv2.resize(reg_frame, reg_display_size))
            cv2.imshow("Constraints", cv2.resize(constraints_frame, display_size))
            cv2.imshow("Robot Parameters", cv2.resize(robot_frame, display_size))

            if cv2.waitKey(1) & 0xFF == 27:  # ESC to exit
                break

        cap.release()
        segmented_out.release()
        depth_out.release()
@@ -468,7 +558,7 @@ class HandProcessor:
        constraints_out.release()
        robot_out.release()
        cv2.destroyAllWindows()

        print(f"Processing complete. Results saved to:")
        print(f"- Hand mask: {segmented_output_path}")
        print(f"- Depth visualization: {depth_output_path}")
@@ -476,12 +566,12 @@ class HandProcessor:
        print(f"- Registration visualization: {registration_output_path}")
        print(f"- Constraints visualization: {constraints_output_path}")
        print(f"- Robot parameters: {robot_output_path}")

        return {
            "segmented": segmented_output_path,
            "depth": depth_output_path,
            "mesh": mesh_output_path,
            "registration": registration_output_path,
            "constraints": constraints_output_path,
            "robot": robot_output_path,
        }
@@ -5,16 +5,16 @@ from hand_processor import HandProcessor

def main():
    processor = HandProcessor()

    record_option = input("Record a new video? (y/n): ")

    if record_option.lower() == "y":
        print("Starting video recording...")
        video_path = processor.record_video()
        if not video_path:
            print("Video recording failed.")
            return

        print(f"Video recorded successfully to {video_path}")
        process_option = input("Process the video now? (y/n): ")
        if process_option.lower() == "n":
@@ -25,9 +25,9 @@ def main():
    while not os.path.exists(video_path):
        print("Error: Video file not found.")
        video_path = input("Enter the path to the video file: ")

    processor.process_video(video_path)


if __name__ == "__main__":
    main()
@@ -1,72 +1,75 @@
import os

import cv2
import numpy as np
import pybullet as p
from utils.math import *
from utils.handle_urdf import handle_urdf
from scipy.spatial.transform import Rotation as R


class RobotManager:
    def __init__(self, urdf_path: str, camera_intrinsics: tuple) -> None:
        self.physics_client = p.connect(p.DIRECT)

        robot_urdf = handle_urdf(urdf_path)
        self.robot_id = self._load_robot(robot_urdf)

        self.joint_count = p.getNumJoints(self.robot_id)
        self.end_effector_index = self._find_end_effector()

        self.fx, self.fy, self.cx, self.cy = camera_intrinsics
        # Set up rendering parameters
        self.img_width = int(self.cx * 2)
        self.img_height = int(self.cy * 2)

    # Load the robot URDF into PyBullet
    def _load_robot(self, robot_urdf: str) -> int:
        try:
            robot_id = p.loadURDF(
                robot_urdf,
                basePosition=[0, 0, 0],
                useFixedBase=True,
                flags=p.URDF_USE_SELF_COLLISION | p.URDF_USE_INERTIA_FROM_FILE,
            )
        except p.error as e:
            print(f"PyBullet error when loading URDF: {e}")
            raise e

        robot_name = p.getBodyInfo(robot_id)[1].decode("utf-8")
        print(f"Successfully loaded {robot_name} robot with ID: {robot_id}")

        return robot_id

    # NOTE: Only applicable if the robot has one end effector
    def _find_end_effector(self) -> int:
        assert self.joint_count > 0, "Robot has no joints"

        # Keywords to look for in joint names to identify end effector
        keywords = ["gripper", "tool", "tcp", "end_effector", "hand"]

        for i in range(self.joint_count):
            info = p.getJointInfo(self.robot_id, i)
            joint_name = info[1].decode("utf-8").lower()

            # Check if any keyword is in the joint name
            if any(keyword in joint_name for keyword in keywords):
                return i

        # If no specific end effector found, use the last joint in the chain
        return self.joint_count - 1
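    # Matching joint names against keywords is a heuristic; when nothing matches, the last
    # joint in the chain is assumed to be the end effector.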
    # TODO: Use inverse kinematics to set the robot pose
    def set_robot_pose(
        self,
        position: np.ndarray,
        orientation_vectors: np.ndarray,
        gripper_width: float,
    ) -> None:
        pass

    # Render the robot in some scene using some camera parameters
    def render_robot(self, bg_image=None, camera_params=None):
        assert self.robot_id >= 0, "Robot not properly loaded"

        # Set up camera parameters
        if camera_params is None:
            # Default camera setup
@@ -77,40 +80,37 @@ class RobotManager:
            cam_roll = 0
        else:
            cam_target, cam_distance, cam_yaw, cam_pitch, cam_roll = camera_params

        # Compute view matrix
        view_matrix = p.computeViewMatrixFromYawPitchRoll(
            cameraTargetPosition=cam_target,
            distance=cam_distance,
            yaw=cam_yaw,
            pitch=cam_pitch,
            roll=cam_roll,
            upAxisIndex=2,
        )

        # Compute projection matrix
        aspect = self.img_width / self.img_height
        proj_matrix = p.computeProjectionMatrixFOV(
            fov=60, aspect=aspect, nearVal=0.01, farVal=100.0
        )

        # Render the scene
        img_arr = p.getCameraImage(
            width=self.img_width,
            height=self.img_height,
            viewMatrix=view_matrix,
            projectionMatrix=proj_matrix,
            renderer=p.ER_BULLET_HARDWARE_OPENGL,
        )

        # Extract RGB and depth
        rgb = np.reshape(img_arr[2], (self.img_height, self.img_width, 4))
        rgb = rgb[:, :, :3]  # Remove alpha channel
        depth = np.reshape(img_arr[3], (self.img_height, self.img_width))

        # If background image is provided, composite
        if bg_image is not None:
            # Resize background if needed
@@ -119,31 +119,33 @@ class RobotManager:
                bg_resized = cv2.resize(bg_image, (self.img_width, self.img_height))
            else:
                bg_resized = bg_image

            # Create mask from depth
            mask = (depth < 0.99).astype(np.float32)
            mask = np.stack([mask, mask, mask], axis=2)

            # Composite
            composite = bg_resized * (1 - mask) + rgb * mask
            return composite.astype(np.uint8)

        return rgb.astype(np.uint8)
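    # p.getCameraImage returns a depth buffer normalized to [0, 1] with the far plane near 1,
    # so the (depth < 0.99) mask above keeps only pixels where the robot was actually rendered.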
    def __del__(self) -> None:
        if hasattr(self, "physics_client"):
            try:
                p.disconnect(self.physics_client)
            except:
                pass


if __name__ == "__main__":
    cwd = os.getcwd()
    urdf_path = os.path.join(
        cwd,
        "notebook/phantom/urdf/SO_5DOF_ARM100_05d.SLDASM/urdf/SO_5DOF_ARM100_05d.SLDASM.urdf",
    )
    camera_intrinsics = (320, 240, 320, 240)  # Random intrinsics for example

    robot_vis = RobotManager(urdf_path, camera_intrinsics)
    rendered_image = robot_vis.render_robot()
@@ -155,4 +157,4 @@ if __name__ == "__main__":
    # Option 2: Save the image to a file
    output_path = "robot_render.png"
    cv2.imwrite(output_path, rendered_image)
    print(f"Render saved to {output_path}")
open_phantom/utils/calibrations.py (new file, 1 line)
@@ -0,0 +1 @@
# TODO: Implement camera calibration functions to extract camera extrinsics and intrinsics
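# A minimal sketch of what this module could hold, using OpenCV's standard checkerboard
# calibration (illustrative only: the function name, board size, and image glob are
# assumptions, not code that exists in the repo).
import glob

import cv2
import numpy as np


def calibrate_camera(image_glob: str, board_size: tuple = (9, 6)) -> tuple:
    # 3D corner positions of the checkerboard in its own plane (z = 0)
    objp = np.zeros((board_size[0] * board_size[1], 3), np.float32)
    objp[:, :2] = np.mgrid[0 : board_size[0], 0 : board_size[1]].T.reshape(-1, 2)

    obj_points, img_points = [], []
    gray = None
    for path in glob.glob(image_glob):
        gray = cv2.cvtColor(cv2.imread(path), cv2.COLOR_BGR2GRAY)
        found, corners = cv2.findChessboardCorners(gray, board_size)
        if found:
            obj_points.append(objp)
            img_points.append(corners)

    assert img_points, "No checkerboard corners found in the provided images"

    # camera_matrix holds fx, fy, cx, cy; dist_coeffs holds the lens distortion model
    _, camera_matrix, dist_coeffs, _, _ = cv2.calibrateCamera(
        obj_points, img_points, gray.shape[::-1], None, None
    )
    return camera_matrix, dist_coeffs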
@@ -2,7 +2,6 @@ import os
import xml.dom.minidom as minidom
import xml.etree.ElementTree as ET


"""
Fixes a specific path that uses the ROS-style 'package://' prefix.
The 'package://' prefix is used in URDF files to refer to files in the same package.
@@ -10,26 +9,28 @@ However, when we're not running in a ROS environment, the paths are not valid.
This function tries to find the absolute path of the mesh file.
If the mesh file is not found, the original path is used.
"""


def fix_path(path: str, urdf_dir: str) -> str:
    if path.startswith("package://"):
        parts = path[len("package://") :].split("/", 1)
        if len(parts) == 2:
            package_name, rel_path = parts

            # Try potential locations for the mesh
            potential_paths = [
                os.path.join(urdf_dir, rel_path),
                os.path.join(urdf_dir, "../meshes", rel_path),
                os.path.join(urdf_dir, f"../{package_name}", rel_path),
                os.path.join(urdf_dir, "../..", rel_path),
            ]

            for possible_path in potential_paths:
                if os.path.exists(possible_path):
                    return possible_path

        print(f"Failed to find mesh for package path: {path}")

    return path
@@ -37,58 +38,67 @@ def fix_path(path: str, urdf_dir: str) -> str:
"""
Iterates through the URDF file and fixes the paths of all mesh files.
The URDF file is parsed and the mesh paths are modified in-place.
"""


def fix_mesh_paths(urdf_path: str, urdf_dir: str) -> str:
    root = ET.parse(urdf_path).getroot()

    try:
        for mesh in root.findall(".//mesh"):
            if "filename" in mesh.attrib:
                mesh.attrib["filename"] = fix_path(mesh.attrib["filename"], urdf_dir)
    except Exception as e:
        print(f"Error fixing mesh paths: {e}")
        raise e

    fixed_path = urdf_path.replace(".urdf", "_fixed.urdf")

    return root, fixed_path


"""
Formats the XML tree to be human-readable and writes it to a file.
"""


def format_xml(root: ET.Element, fixed_path: str) -> None:
    xml_str = ET.tostring(root, encoding="utf-8")
    dom = minidom.parseString(xml_str)

    with open(fixed_path, "w", encoding="utf-8") as f:
        # Write with nice indentation but remove extra whitespace
        pretty_xml = dom.toprettyxml(indent="  ")
        # Remove extra blank lines that minidom sometimes adds
        pretty_xml = "\n".join(
            [line for line in pretty_xml.split("\n") if line.strip()]
        )
        f.write(pretty_xml)


def handle_urdf(urdf_path: str) -> str:
    # Check if URDF is valid
    if not os.path.exists(urdf_path):
        print(f"Invalid URDF path: {urdf_path}")
        return None

    # FIXME: Add check to see if URDF needs to be processed

    try:
        urdf_dir = os.path.dirname(urdf_path)
        root, fixed_path = fix_mesh_paths(urdf_path, urdf_dir)
        format_xml(root, fixed_path)
        print(f"Successfully processed URDF: {fixed_path}")
        return fixed_path
    except Exception as e:
        print(f"Failed to process URDF: {e}")
        raise e


if __name__ == "__main__":
    # Example usage
    cwd = os.getcwd()
    urdf_path = os.path.join(
        cwd,
        "open_phantom/urdf/SO_5DOF_ARM100_05d.SLDASM/urdf/SO_5DOF_ARM100_05d.SLDASM.urdf",
    )
    handle_urdf(urdf_path)
@@ -6,75 +6,107 @@ def visualize_mesh(frame, hand_vertices, faces):
    """Visualize the hand mesh on the frame"""
    mesh_vis = frame.copy()
    height, width = frame.shape[:2]

    # Draw the base vertices of the mesh (original landmarks)
    for i in range(21):
        pt = tuple(map(int, hand_vertices[i][:2]))
        if 0 <= pt[0] < width and 0 <= pt[1] < height:
            cv2.circle(mesh_vis, pt, 4, (255, 0, 0), -1)  # Red circles for base vertices

    # Draw the interpolated vertices
    for i in range(21, len(hand_vertices)):
        pt = tuple(map(int, hand_vertices[i][:2]))
        if 0 <= pt[0] < width and 0 <= pt[1] < height:
            cv2.circle(mesh_vis, pt, 2, (0, 255, 255), -1)  # Yellow circles for interpolated vertices

    # Draw the faces of the mesh
    for face in faces:
        if len(face) == 3:  # Triangle face
            pt1 = tuple(map(int, hand_vertices[face[0]][:2]))
            pt2 = tuple(map(int, hand_vertices[face[1]][:2]))
            pt3 = tuple(map(int, hand_vertices[face[2]][:2]))

            if (
                0 <= pt1[0] < width
                and 0 <= pt1[1] < height
                and 0 <= pt2[0] < width
                and 0 <= pt2[1] < height
                and 0 <= pt3[0] < width
                and 0 <= pt3[1] < height
            ):
                cv2.line(mesh_vis, pt1, pt2, (0, 255, 0), 1)  # Green lines for mesh edges
                cv2.line(mesh_vis, pt2, pt3, (0, 255, 0), 1)
                cv2.line(mesh_vis, pt3, pt1, (0, 255, 0), 1)

    # Add explanation text
    cv2.putText(mesh_vis, "Red: Base vertices", (10, 30),
                cv2.FONT_HERSHEY_SIMPLEX, 0.6, (255, 0, 0), 2)
    cv2.putText(mesh_vis, "Yellow: Interpolated vertices", (10, 60),
                cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0, 255, 255), 2)
    cv2.putText(mesh_vis, "Green: Mesh edges", (10, 90),
                cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0, 255, 0), 2)

    return mesh_vis


def visualize_registration(cloud, vertices, icp_transform):
    """Visualize the ICP registration process"""
    height, width = 480, 640  # Fixed size visualization
    reg_vis = np.ones((height, width, 3), dtype=np.uint8) * 255  # White background

    # Define visualization boundaries
    margin = 50
    view_width = width - 2 * margin
    view_height = height - 2 * margin

    # Find 2D min/max values for scaling
    all_points_2d = np.vstack([cloud[:, :2], vertices[:, :2]])
    min_vals = np.min(all_points_2d, axis=0)
    max_vals = np.max(all_points_2d, axis=0)

    # Function to scale points to fit visualization
    def scale_point(point):
        scaled = (point - min_vals) / (max_vals - min_vals)
        x = int(scaled[0] * view_width) + margin
        y = int(scaled[1] * view_height) + margin
        return (x, y)

    # Draw original point cloud (red)
    for point in cloud[::10]:  # Downsample for visualization
        pt = scale_point(point[:2])
        cv2.circle(reg_vis, pt, 1, (0, 0, 255), -1)  # Red dots

    # Draw original mesh vertices (blue)
    for vertex in vertices:
        pt = scale_point(vertex[:2])
        cv2.circle(reg_vis, pt, 2, (255, 0, 0), -1)  # Blue dots

    # Apply transformation to mesh vertices
    transformed_vertices = []
    for vertex in vertices:
@@ -83,181 +115,319 @@ def visualize_registration(cloud, vertices, icp_transform):
        # Apply transformation
        v_transformed = np.dot(icp_transform, v_hom)
        transformed_vertices.append(v_transformed[:3])

    # Draw transformed mesh vertices (green)
    for vertex in transformed_vertices:
        pt = scale_point(vertex[:2])
        cv2.circle(reg_vis, pt, 2, (0, 255, 0), -1)  # Green dots

    # Add transformation matrix display
    cv2.putText(reg_vis, "ICP Transformation Matrix:", (10, height - 120),
                cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 0), 1)

    for i in range(4):
        matrix_text = f"[{icp_transform[i][0]:.2f}, {icp_transform[i][1]:.2f}, {icp_transform[i][2]:.2f}, {icp_transform[i][3]:.2f}]"
        cv2.putText(reg_vis, matrix_text, (10, height - 90 + i * 20),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.4, (0, 0, 0), 1)

    # Add legend
    cv2.putText(reg_vis, "Red: Point Cloud", (width - 200, 30),
                cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 255), 1)
    cv2.putText(reg_vis, "Blue: Original Mesh", (width - 200, 60),
                cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 0, 0), 1)
    cv2.putText(reg_vis, "Green: Transformed Mesh", (width - 200, 90),
                cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 1)

    return reg_vis
def visualize_constraints(
    frame, refined_landmarks, constrained_landmarks, camera_intrinsics
):
    """Visualize before and after applying anatomical constraints"""
    constraints_vis = frame.copy()
    height, width = frame.shape[:2]
    focal_x, focal_y, center_x, center_y = camera_intrinsics

    # Define finger indices
    thumb_indices = [1, 2, 3, 4]
    index_indices = [5, 6, 7, 8]
    highlighted_indices = thumb_indices + index_indices

    # Draw original refined landmarks
    for i, landmark in enumerate(refined_landmarks):
        x, y, z = landmark
        if z > 0:
            u = int(x * focal_x / z + center_x)
            v = int(y * focal_y / z + center_y)

            if 0 <= u < width and 0 <= v < height:
                if i in highlighted_indices:
                    # Draw larger circles for thumb and index fingers (the constrained ones)
                    cv2.circle(constraints_vis, (u, v), 5, (0, 0, 255), -1)  # Red circles
                else:
                    cv2.circle(constraints_vis, (u, v), 3, (0, 0, 255), -1)

    # Draw connections for original landmarks
    for i in range(len(refined_landmarks) - 1):
        if i + 1 in thumb_indices and i in thumb_indices:
            # Draw thumb connections
            start = refined_landmarks[i]
            end = refined_landmarks[i + 1]

            if start[2] > 0 and end[2] > 0:
                start_u = int(start[0] * focal_x / start[2] + center_x)
                start_v = int(start[1] * focal_y / start[2] + center_y)
                end_u = int(end[0] * focal_x / end[2] + center_x)
                end_v = int(end[1] * focal_y / end[2] + center_y)

                if (
                    0 <= start_u < width
                    and 0 <= start_v < height
                    and 0 <= end_u < width
                    and 0 <= end_v < height
                ):
                    cv2.line(constraints_vis, (start_u, start_v), (end_u, end_v), (0, 0, 255), 2)

    # Draw constrained landmarks
    for i, landmark in enumerate(constrained_landmarks):
        x, y, z = landmark
        if z > 0:
            u = int(x * focal_x / z + center_x)
            v = int(y * focal_y / z + center_y)

            if 0 <= u < width and 0 <= v < height:
                if i in highlighted_indices:
                    # Draw larger circles for thumb and index fingers
                    cv2.circle(constraints_vis, (u, v), 5, (0, 255, 0), -1)  # Green circles
                else:
                    cv2.circle(constraints_vis, (u, v), 3, (0, 255, 0), -1)

    # Draw connections for constrained landmarks
    for i in range(len(constrained_landmarks) - 1):
        if i + 1 in thumb_indices and i in thumb_indices:
            # Draw thumb connections
            start = constrained_landmarks[i]
            end = constrained_landmarks[i + 1]

            if start[2] > 0 and end[2] > 0:
                start_u = int(start[0] * focal_x / start[2] + center_x)
                start_v = int(start[1] * focal_y / start[2] + center_y)
                end_u = int(end[0] * focal_x / end[2] + center_x)
                end_v = int(end[1] * focal_y / end[2] + center_y)

                if (
                    0 <= start_u < width
                    and 0 <= start_v < height
                    and 0 <= end_u < width
                    and 0 <= end_v < height
                ):
                    cv2.line(constraints_vis, (start_u, start_v), (end_u, end_v), (0, 255, 0), 2)

    # Add legend
    cv2.putText(constraints_vis, "Red: Before constraints", (10, 30),
                cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 0, 255), 2)
    cv2.putText(constraints_vis, "Green: After constraints", (10, 60),
                cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 255, 0), 2)

    return constraints_vis
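# Projection in these visualizations uses the same pinhole model as the rest of the code:
# u = x * fx / z + cx and v = y * fy / z + cy, which is why only points with z > 0 are drawn.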
def visualize_robot_params(
    frame, position, orientation_matrix, gripper_width, camera_intrinsics
):
    """Visualize robot parameters (position, orientation matrix, gripper width)"""
    robot_vis = frame.copy()
    height, width = frame.shape[:2]
    focal_x, focal_y, center_x, center_y = camera_intrinsics

    # Extract axes from orientation matrix
    x_axis = orientation_matrix[:, 0]  # First column (principal axis)
    y_axis = orientation_matrix[:, 1]  # Second column
    z_axis = orientation_matrix[:, 2]  # Third column (normal)

    # Project position to image coordinates
    x, y, z = position
    if z > 0:
        u = int(x * focal_x / z + center_x)
        v = int(y * focal_y / z + center_y)

        if 0 <= u < width and 0 <= v < height:
            # Draw end effector position
            cv2.circle(robot_vis, (u, v), 10, (255, 0, 0), -1)  # Blue circle

            # Draw X axis (principal axis)
            dx, dy, dz = x_axis
            scale = 50  # Scale for better visualization
            end_u = int((x + dx * scale) * focal_x / (z + dz * scale) + center_x)
            end_v = int((y + dy * scale) * focal_y / (z + dz * scale) + center_y)

            if 0 <= end_u < width and 0 <= end_v < height:
                cv2.line(robot_vis, (u, v), (end_u, end_v), (0, 0, 255), 2)  # Red line (X axis)
                cv2.putText(robot_vis, "X", (end_u, end_v),
                            cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 255), 2)

            # Draw Z axis (normal)
            nx, ny, nz = z_axis
            end_u = int((x + nx * scale) * focal_x / (z + nz * scale) + center_x)
            end_v = int((y + ny * scale) * focal_y / (z + nz * scale) + center_y)

            if 0 <= end_u < width and 0 <= end_v < height:
                cv2.line(robot_vis, (u, v), (end_u, end_v), (0, 255, 0), 2)  # Green line (Z axis)
                cv2.putText(robot_vis, "Z", (end_u, end_v),
                            cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2)

            # Draw Y axis
            yx, yy, yz = y_axis
            end_u = int((x + yx * scale) * focal_x / (z + yz * scale) + center_x)
            end_v = int((y + yy * scale) * focal_y / (z + yz * scale) + center_y)

            if 0 <= end_u < width and 0 <= end_v < height:
                cv2.line(robot_vis, (u, v), (end_u, end_v), (255, 0, 0), 2)  # Blue line (Y axis)
                cv2.putText(robot_vis, "Y", (end_u, end_v),
                            cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 0, 0), 2)

            # Visualize gripper width
            gripper_radius = int(gripper_width * 100)  # Scale for better visualization
            cv2.circle(robot_vis, (u, v), gripper_radius, (0, 255, 255), 2)  # Yellow circle

            # Add parameter values as text
            y_offset = height - 160
            cv2.putText(robot_vis, f"Position: ({x:.2f}, {y:.2f}, {z:.2f})", (10, y_offset),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 255), 1)

            y_offset += 30
            cv2.putText(robot_vis, f"X axis: ({x_axis[0]:.2f}, {x_axis[1]:.2f}, {x_axis[2]:.2f})", (10, y_offset),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 255), 1)

            y_offset += 30
            cv2.putText(robot_vis, f"Z axis: ({z_axis[0]:.2f}, {z_axis[1]:.2f}, {z_axis[2]:.2f})", (10, y_offset),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 1)

            y_offset += 30
            cv2.putText(robot_vis, f"Gripper Width: {gripper_width:.2f}", (10, y_offset),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 255), 1)

    return robot_vis