# WiLoR/triangulate.py (SPICExLAB/WiLoR, main branch)
from pathlib import Path
import torch
from ultralytics.nn.tasks import PoseModel as UltralyticsPoseModel
import argparse
import os
import cv2
import numpy as np
import json
from typing import Dict, Optional

from wilor.models import WiLoR, load_wilor
from wilor.utils import recursive_to
from wilor.datasets.vitdet_dataset import ViTDetDataset, DEFAULT_MEAN, DEFAULT_STD
from wilor.utils.renderer import Renderer, cam_crop_to_full
from ultralytics import YOLO
from manopth.manolayer import ManoLayer
# from smplx import MANOLayer

# RGB colour passed to the renderer when exporting hand meshes.
LIGHT_PURPLE = (0.25098039, 0.274117647, 0.65882353)

# One MANO layer per hand side, both configured identically
# (full 45-dimensional axis-angle pose, no PCA).
mano_model = {
    side: ManoLayer(
        mano_root='mano_data',
        use_pca=False,
        ncomps=45,
        side=side
    )
    for side in ('left', 'right')
}

# Camera names in priority order for each hand: earlier entries are preferred
# both as triangulation views and as the reference prediction.
cams_for_left = ("0", "3", "1", "2")
cams_for_right = ("1", "2", "0", "3")

def main_naive():
    """
    Run WiLoR on synchronized multi-camera frames and triangulate each hand.

    Every sub-folder of ``--in_folder`` is treated as one camera; its images
    are sorted and the i-th image of every camera is taken to be the same
    frame. For each frame, every view is passed through the YOLO hand detector
    and the WiLoR model. Per hand side, the projected 2D joints of the two
    highest-priority cameras that produced a prediction (``cams_for_left`` /
    ``cams_for_right``) are triangulated with the projection matrices from
    ``get_camera_params()``. The reference camera's prediction is then rigidly
    aligned to the triangulated joints with ``kabsch_numpy``:

    * MANO parameters are written to
      ``<out_folder>/mano/<side>/<frame_idx>.json`` (keys ``Rh``, ``poses``,
      ``shapes``, ``Th``).
    * With ``--save_mesh`` the aligned vertices are exported to
      ``<out_folder>/mesh/<image name>_<0|1>.obj``.

    Arguments are read from the command line; the function returns ``None``.
    """
    parser = argparse.ArgumentParser(description='WiLoR demo code')
    parser.add_argument('--in_folder', type=str, default='images', help='Folder with input images')
    parser.add_argument('--out_folder', type=str, default='out_demo', help='Output folder to save rendered results')
    parser.add_argument('--save_mesh', dest='save_mesh', action='store_true', default=False, help='If set, save meshes to disk also')
    parser.add_argument('--rescale_factor', type=float, default=2.0, help='Factor for padding the bbox')
    parser.add_argument('--file_type', nargs='+', default=['*.jpg', '*.png', '*.jpeg'], help='List of file extensions to consider')

    args = parser.parse_args()

    # Load checkpoints
    model, model_cfg = load_wilor(checkpoint_path = './pretrained_models/wilor_final.ckpt' , cfg_path= './pretrained_models/model_config.yaml')
    detector = YOLO('./pretrained_models/detector.pt')
    # Setup the renderer (only used to turn vertices into a trimesh for export)
    renderer = Renderer(model_cfg, faces=model.mano.faces)

    device   = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
    model    = model.to(device)
    detector = detector.to(device)
    model.eval()

    # Make output directory if it does not exist
    os.makedirs(args.out_folder, exist_ok=True)

    proj_matrices = get_camera_params()
    base_img_folder = Path(args.in_folder)
    camera_image_sequences = {}
    # Sort camera folders by name to ensure a consistent processing order
    camera_folder_paths = sorted([p for p in base_img_folder.iterdir() if p.is_dir()])

    for camera_folder_path in camera_folder_paths:
        camera_name = camera_folder_path.name
        current_camera_imgs = []
        for ext_pattern in args.file_type:
            current_camera_imgs.extend(list(camera_folder_path.glob(ext_pattern)))

        current_camera_imgs.sort()  # Sort images to ensure frame sequence
        camera_image_sequences[camera_name] = current_camera_imgs
        print(f"Camera '{camera_name}': Found {len(current_camera_imgs)} images.")

    active_camera_names = list(camera_image_sequences)
    if not active_camera_names:
        print(f"No camera folders found in '{base_img_folder}', nothing to do.")
        return

    # Only frames available in *every* camera can be processed; indexing by
    # the first camera's count would run past the end of shorter sequences.
    num_frames_per_camera = [len(camera_image_sequences[cam_name]) for cam_name in active_camera_names]
    num_synchronized_frames = min(num_frames_per_camera)
    if len(set(num_frames_per_camera)) > 1:
        print(f"Warning: cameras have different frame counts {num_frames_per_camera}; "
              f"processing the first {num_synchronized_frames} frames only.")

    for frame_idx in range(num_synchronized_frames):

        # {camera_name: path_to_img_for_this_frame}
        current_frame_image_paths = {
            camera_name: camera_image_sequences[camera_name][frame_idx]
            for camera_name in active_camera_names
        }

        # Per hand side: {camera_name: projected 2D joints}
        collected_keypoints_for_frame = {"left": {}, "right": {}}
        # Per hand side: {camera_name: predicted mesh vertices}
        verts_for_frame = {"left": {}, "right": {}}

        # WiLoR-predicted 3D joints / MANO parameters of the best-priority
        # camera seen so far for each hand side.
        kpts3d_for_frame = {}
        mano_params_for_frame = {}
        ref_cam_priority = { "left": float('inf'), "right": float('inf') }
        for camera_name, img_path in current_frame_image_paths.items():
            print(f"Processing {camera_name}'s view: {img_path.name}")
            # Get filename from path img_path (reused below to name exported meshes)
            img_fn, _ = os.path.splitext(os.path.basename(img_path))
            img_cv2 = cv2.imread(str(img_path))
            if img_cv2 is None:
                # cv2.imread signals an unreadable file by returning None.
                print(f"Warning: could not read image {img_path}, skipping this view.")
                continue
            detections = detector(img_cv2, conf = 0.3, verbose=False)[0]
            bboxes      = []
            right_flags = []
            for det in detections:
                bbox_data = det.boxes.data.cpu().detach().squeeze().numpy()
                right_flags.append(det.boxes.cls.cpu().detach().squeeze().item())
                bboxes.append(bbox_data[:4].tolist())

            if len(bboxes) == 0:
                continue
            boxes = np.stack(bboxes)
            right = np.stack(right_flags)
            dataset = ViTDetDataset(model_cfg, img_cv2, boxes, right, rescale_factor=args.rescale_factor)
            dataloader = torch.utils.data.DataLoader(dataset, batch_size=16, shuffle=False, num_workers=0)

            for batch in dataloader:
                batch = recursive_to(batch, device)

                with torch.no_grad():
                    out = model(batch)

                # Flip the sign of the second camera component for left hands.
                multiplier    = (2*batch['right']-1)
                pred_cam      = out['pred_cam']
                pred_cam[:,1] = multiplier*pred_cam[:,1]
                box_center    = batch["box_center"].float()
                box_size      = batch["box_size"].float()
                img_size      = batch["img_size"].float()
                scaled_focal_length = model_cfg.EXTRA.FOCAL_LENGTH / model_cfg.MODEL.IMAGE_SIZE * img_size.max()
                pred_cam_t_full     = cam_crop_to_full(pred_cam, box_center, box_size, img_size, scaled_focal_length).detach().cpu().numpy()

                batch_size = batch['img'].shape[0]
                for n in range(batch_size):
                    verts  = out['pred_vertices'][n].detach().cpu().numpy()
                    joints = out['pred_keypoints_3d'][n].detach().cpu().numpy()
                    is_right    = batch['right'][n].cpu().numpy()
                    # Mirror the x axis for left hands.
                    verts[:,0]  = (2*is_right-1)*verts[:,0]
                    joints[:,0] = (2*is_right-1)*joints[:,0]
                    cam_t = pred_cam_t_full[n]
                    joint_kpts_2d = project_full_img(joints, cam_t, scaled_focal_length, img_size[n])

                    mano_params = {}
                    mano_params['global_orient'] = out['pred_mano_params']['global_orient'][n].detach().cpu().numpy()
                    mano_params['poses'] = out['pred_mano_params']['hand_pose'][n].detach().cpu().numpy()
                    mano_params['shapes'] = out['pred_mano_params']['betas'][n].detach().cpu().numpy()

                    # Assume that there is one right and one left hand per image:
                    # a later detection of the same side overwrites the earlier one.
                    if is_right and camera_name in cams_for_right:
                        collected_keypoints_for_frame["right"][camera_name] = joint_kpts_2d
                        verts_for_frame["right"][camera_name] = verts

                        # Keep the prediction of the highest-priority camera
                        # (lowest index in cams_for_right) as the reference.
                        current_priority = cams_for_right.index(camera_name)
                        if current_priority < ref_cam_priority["right"]:
                            kpts3d_for_frame["right"] = joints
                            mano_params_for_frame["right"] = mano_params
                            ref_cam_priority["right"] = current_priority

                    elif not is_right and camera_name in cams_for_left:
                        collected_keypoints_for_frame["left"][camera_name] = joint_kpts_2d
                        verts_for_frame["left"][camera_name] = verts

                        # Same reference selection, using cams_for_left.
                        current_priority = cams_for_left.index(camera_name)
                        if current_priority < ref_cam_priority["left"]:
                            kpts3d_for_frame["left"] = joints
                            mano_params_for_frame["left"] = mano_params
                            ref_cam_priority["left"] = current_priority

        verts_this_frame = {}

        for hand_side in ["left", "right"]:
            keypoints = collected_keypoints_for_frame[hand_side]
            cams = cams_for_left if hand_side == "left" else cams_for_right

            # The two highest-priority cameras that produced this hand.
            cams_to_use = [cam_name for cam_name in cams if cam_name in keypoints][:2]

            kpts_list = []
            for cam in cams_to_use:
                kp = keypoints[cam]

                # The projection may come back as a tensor or an array.
                if hasattr(kp, 'detach'):
                    kp_np = kp.detach().cpu().numpy()
                else:
                    kp_np = np.array(kp)

                kp_np = kp_np.astype(np.float64)
                kpts_list.append(kp_np)
                print(f"Camera {cam} keypoints shape: {kp_np.shape}")

            if len(kpts_list) != 2:
                print(f"Error: Need exactly 2 cameras for {hand_side} hand, got {len(kpts_list)}")
                continue

            if kpts_list[0].shape != kpts_list[1].shape:
                print(f"Error: Keypoint shapes don't match for {hand_side} hand:")
                print(f"  {cams_to_use[0]}: {kpts_list[0].shape}")
                print(f"  {cams_to_use[1]}: {kpts_list[1].shape}")
                continue

            kpts = np.array(kpts_list).astype(np.float64)
            projs = np.array([proj_matrices[cams_to_use[0]], proj_matrices[cams_to_use[1]]]).astype(np.float64)

            # cv2.triangulatePoints expects 2xN point arrays and returns 4xN
            # homogeneous coordinates.
            triangulated_kpts_homog = cv2.triangulatePoints(projs[0], projs[1], kpts[0].T, kpts[1].T)
            w = triangulated_kpts_homog[3, :]
            epsilon = 1e-8

            num_points = triangulated_kpts_homog.shape[1]
            triangulated_kpts_3d = np.full((num_points, 3), np.nan, dtype=np.float64)

            # Points with (near-)zero w cannot be dehomogenised; they stay NaN.
            valid_w_mask = np.abs(w) > epsilon

            if np.any(valid_w_mask):
                triangulated_kpts_3d[valid_w_mask, :] = \
                    (triangulated_kpts_homog[:3, valid_w_mask] / w[valid_w_mask]).T

            # NaN rows would poison the SVD inside kabsch_numpy, so only the
            # successfully triangulated joints take part in the alignment.
            if np.count_nonzero(valid_w_mask) < 3:
                print(f"Error: Only {np.count_nonzero(valid_w_mask)} valid triangulated points "
                      f"for {hand_side} hand, need at least 3")
                continue
            triangulated_valid = triangulated_kpts_3d[valid_w_mask]

            #### directly save mesh
            rot_mesh, t_mesh = kabsch_numpy(kpts3d_for_frame[hand_side][valid_w_mask], triangulated_valid)

            reference_cam = cams_to_use[0]
            if reference_cam in verts_for_frame[hand_side]:
                 verts_this_frame[hand_side] = (verts_for_frame[hand_side][reference_cam] @ rot_mesh.T + t_mesh)
            ####

            original_poses = mano_params_for_frame[hand_side]['poses']
            original_shapes = mano_params_for_frame[hand_side]['shapes']

            # Per-joint rotation matrices -> one flat axis-angle vector. Used
            # both as MANO input below and as the 'poses' entry of the JSON.
            poses_flat_vector = np.concatenate([
                cv2.Rodrigues(rot_mat)[0].flatten() for rot_mat in original_poses
            ])

            # Zero global rotation: the MANO joints are generated in a
            # canonical orientation and the global pose is recovered by Kabsch.
            canonical_pose_coeffs = np.concatenate([
                np.zeros(3),
                poses_flat_vector
            ])

            pose_coeffs_tensor = torch.tensor(canonical_pose_coeffs, dtype=torch.float32).unsqueeze(0)
            shapes_tensor = torch.tensor(original_shapes, dtype=torch.float32).unsqueeze(0)

            # NOTE(review): manopth's ManoLayer may return joints in
            # millimetres while the triangulated points use the calibration's
            # units; confirm the scales match, otherwise 'Th' is off.
            _, canonical_joints = mano_model[hand_side](
                th_pose_coeffs= pose_coeffs_tensor,
                th_betas=shapes_tensor,
                th_trans=torch.zeros(1, 3)
            )

            canonical_joints_np = canonical_joints[0].detach().numpy()
            rot, t = kabsch_numpy(canonical_joints_np[valid_w_mask], triangulated_valid)

            new_global_orient, _ = cv2.Rodrigues(rot)
            new_global_orient = new_global_orient.flatten()

            json_data = {
                'Rh': new_global_orient.tolist(),
                'poses': poses_flat_vector.tolist(),
                'shapes': original_shapes.tolist(),
                'Th': t.tolist()
            }
            output_path = f"{args.out_folder}/mano/{hand_side}/{frame_idx}.json"
            os.makedirs(os.path.dirname(output_path), exist_ok=True)

            with open(output_path, 'w') as f:
                json.dump(json_data, f, indent=2)

        if args.save_mesh:
            mesh_output_dir = os.path.join(args.out_folder, 'mesh')
            for hand in ["left", "right"]:
                if hand in verts_this_frame:
                    verts = verts_this_frame[hand]
                    # Vertices are already in the triangulated frame, hence zero translation.
                    tmesh = renderer.vertices_to_trimesh(verts, np.zeros(3), LIGHT_PURPLE, is_right=(hand=="right"))
                    os.makedirs(mesh_output_dir, exist_ok=True)
                    # img_fn is the image name of the last camera processed for this frame.
                    tmesh.export(os.path.join(mesh_output_dir, f'{img_fn}_{int(hand == "right")}.obj'))

def ransac(kpts, proj_matrices, tol = 5.0):
    """
    Placeholder for choosing the best camera pair for a hand (not implemented).

    Intended input: the 2D keypoints of every camera pair; four cameras
    yield 4C2 = 6 pairs. The goal is to find which pair gives the best
    prediction for the hand.

    The function currently ignores all of its arguments and returns ``None``.
    """
    return None

def get_camera_params(intri="camera_params/intri.yml", extri="camera_params/extri.yml"):
    """
    Build one projection matrix per camera from YAML calibration files.

    The extrinsics file must list the camera names under ``names``. For each
    name ``n`` the entries ``K_n`` (intrinsics file) and ``Rot_n`` / ``T_n``
    (extrinsics file) are read; each entry is a mapping with ``rows``,
    ``cols`` and a flat ``data`` list. The projection matrix is
    ``K @ [Rot | T]``.

    :param intri: Path to the intrinsic parameters file.
    :param extri: Path to the extrinsic parameters file.
    :return: A dict mapping each camera name to its projection matrix
             (a numpy array).
    """
    import yaml

    def _entry_to_matrix(entry):
        # Reshape the flat 'data' list to the stored (rows, cols) layout.
        return np.array(entry['data']).reshape((entry['rows'], entry['cols']))

    with open(intri, 'r') as intri_file:
        intrinsics = yaml.safe_load(intri_file)
    with open(extri, 'r') as extri_file:
        extrinsics = yaml.safe_load(extri_file)

    proj_matrices = {}
    for cam in extrinsics.get('names'):
        # Look up all three entries before converting any of them.
        k_entry = intrinsics[f'K_{cam}']
        rot_entry = extrinsics[f'Rot_{cam}']
        t_entry = extrinsics[f'T_{cam}']

        intrinsic_mat = _entry_to_matrix(k_entry)
        rotation_mat = _entry_to_matrix(rot_entry)
        translation_mat = _entry_to_matrix(t_entry)

        # Append the translation column(s) to the rotation, then apply K.
        proj_matrices[cam] = intrinsic_mat @ np.hstack((rotation_mat, translation_mat))

    return proj_matrices

def kabsch_numpy(P, Q):
    """
    Computes the optimal rigid transform (rotation and translation) that
    aligns one point set onto another (P -> Q) with the Kabsch algorithm.

    The result satisfies ``Q ~= P @ R.T + t`` in the least-squares sense.
    Correspondences in which either point has a non-finite coordinate
    (NaN or inf) are ignored, so partially failed measurements in Q do not
    break the alignment.

    :param P: A Nx3 matrix of source points
    :param Q: A Nx3 matrix of target points, row i corresponding to P[i]
    :return: A tuple ``(R, t)`` with the optimal 3x3 rotation matrix and the
            translation vector to apply after the rotation.
    :raises ValueError: if non-finite rows are present and no finite
            correspondence is left after removing them.
    """
    P = np.asarray(P)
    Q = np.asarray(Q)
    assert P.shape == Q.shape, f"Matrix dimensions must match, {P.shape} != {Q.shape}"

    # Drop correspondences containing NaN/inf; a single such value would
    # otherwise propagate into the covariance matrix and the SVD.
    if P.ndim == 2:
        finite_rows = np.isfinite(P).all(axis=1) & np.isfinite(Q).all(axis=1)
        if not finite_rows.all():
            if not finite_rows.any():
                raise ValueError("kabsch_numpy: no finite point correspondences to align")
            P = P[finite_rows]
            Q = Q[finite_rows]

    # Compute centroids
    centroid_P = np.mean(P, axis=0)
    centroid_Q = np.mean(Q, axis=0)

    # Center the points
    p = P - centroid_P
    q = Q - centroid_Q

    # Compute the covariance matrix
    H = np.dot(p.T, q)

    # SVD (the singular values themselves are not needed)
    U, _, Vt = np.linalg.svd(H)

    # A negative determinant means the best orthogonal fit is a reflection;
    # flip the last singular direction to obtain a proper rotation.
    if np.linalg.det(np.dot(Vt.T, U.T)) < 0.0:
        Vt[-1, :] *= -1.0

    # Optimal rotation
    R = np.dot(Vt.T, U.T)

    # Translation that maps the rotated source centroid onto the target centroid
    t_direct = centroid_Q - (centroid_P @ R.T)
    return R, t_direct

def project_full_img(points, cam_trans, focal_length, img_res):
    """
    Project 3D points onto the 2D image plane with a pinhole camera model.

    The intrinsic matrix uses ``focal_length`` on both axes and places the
    principal point at ``(img_res[0] / 2, img_res[1] / 2)``.

    :param points: (N, 3) points, translated by ``cam_trans`` before projection.
    :param cam_trans: Translation added to every point.
    :param focal_length: Focal length used for both x and y.
    :param img_res: Two-element image resolution; index 0 feeds the x
                    principal point, index 1 the y principal point.
    :return: (N, 2) projected coordinates.
    """
    center_x, center_y = img_res[0] / 2., img_res[1] / 2.

    intrinsics = torch.eye(3)
    intrinsics[0, 0] = focal_length
    intrinsics[1, 1] = focal_length
    intrinsics[0, 2] = center_x
    intrinsics[1, 2] = center_y

    # Move into the camera frame, then divide by depth (last coordinate).
    shifted = points + cam_trans
    normalized = shifted / shifted[..., -1:]

    # Apply the intrinsics row-wise and drop the homogeneous coordinate.
    projected = (intrinsics @ normalized.T).T
    return projected[..., :-1]

if __name__ == '__main__':
    main_naive()