From c62c4cd2b376642fe7a845bf9b2b1712ab656230 Mon Sep 17 00:00:00 2001 From: Zhe Zhang Date: Sat, 18 Apr 2026 00:23:55 -0700 Subject: [PATCH] Add diagnose_boxer_output.py: geometric quality metrics without ground truth MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Three orthogonal per-box self-consistency metrics for Boxer 3D output, useful for debugging custom loaders / in-the-wild captures that lack 3D ground-truth annotations. A. dist_to_cloud — distance from the box center to the nearest scene- cloud point (uses ctx.sdp_global or ctx.uid_to_p3 depending on the dataset). Catches boxes placed in empty space. B. iou2d — reproject the box's 8 corners with the same (K, pose) the loader supplied, take the axis-aligned 2D bbox, compute IoU with the OWL prompt bbox from owl_2dbbs.csv. Catches K / pose / 3D position inconsistency. C. depth_gap — camera-frame Z of the box's 8 corners vs. the q10..q90 Z range of SDP points whose projection lands inside the OWL 2D bbox. Catches "right angular direction, wrong distance" — a failure mode systematic K bias or per-segment scale drift produces, which A and B both tend to miss. Each metric is orthogonal: one can pass and the others fail. Combining them as `--filter` thresholds drops clearly-bad per-frame boxes before `fuse_3d_boxes.fuse_obbs_from_csv` clusters them, producing a cleaner scene graph. Outputs (in `//`): - diagnose_by_box.csv per-box metrics - diagnose_summary.json aggregate distributions - _3dbbs_filtered.csv with --filter Follows the CLI shape of view_fusion.py / view_tracker.py: python diagnose_boxer_output.py --input python diagnose_boxer_output.py --input --filter \\ --max_dist_to_cloud 0.5 --min_iou2d 0.1 --max_depth_gap 0.5 Uses build_seq_ctx to pull pose / K / SDP per dataset, so Aria and CA-1M work out of the box; ScanNet falls back to NaN for A/C when no pre-scanned cloud is in the context (IoU still works). README: add 'Demo #3.5' section pointing at the tool between the Offline Fusion demo and the Online Tracker demo. --- README.md | 19 ++ diagnose_boxer_output.py | 412 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 431 insertions(+) create mode 100644 diagnose_boxer_output.py diff --git a/README.md b/README.md index ff06e18..428eb5b 100644 --- a/README.md +++ b/README.md @@ -96,6 +96,25 @@ You should see a window like this: ![View Fusion Demo](docs/images/view_fusion_demo.jpg) +## Demo #3.5: Diagnose 3D output quality without ground truth + +Useful when you've plugged in a custom dataset loader or captured your own +sequence and want to know whether the 3D boxes look right — without any +ground-truth annotations. + +```bash +python diagnose_boxer_output.py --input nym10_gen1 +python diagnose_boxer_output.py --input nym10_gen1 --filter +``` + +Computes three geometric self-consistency metrics per 3D box +(distance-to-cloud, reprojection-IoU vs. OWL 2D bbox, depth-gap vs. SDP +inside the 2D bbox) and writes `diagnose_by_box.csv` + +`diagnose_summary.json` to the output dir. With `--filter`, writes a +`*_3dbbs_filtered.csv` that `fuse_3d_boxes.fuse_obbs_from_csv` can +consume for a cleaner fused scene graph. See the top of +`diagnose_boxer_output.py` for what each metric catches. + ## Demo #4: Online Tracker (requires Demo #1) Make sure to run Demo #1 above first to generate the 2DBB and 3DBB CSVs. Run the online tracker, which will estimate 3DBBs on the fly as new images are observed: diff --git a/diagnose_boxer_output.py b/diagnose_boxer_output.py new file mode 100644 index 0000000..43c2d6c --- /dev/null +++ b/diagnose_boxer_output.py @@ -0,0 +1,412 @@ +#! /usr/bin/env python3 + +# Copyright (c) Meta Platforms, Inc. and affiliates. +# This source code is licensed under the CC-BY-NC 4.0 license found in the +# LICENSE file in the root directory of this source tree. + +# pyre-unsafe +"""Geometric quality metrics for Boxer 3D outputs — no ground truth needed. + +Three orthogonal self-consistency checks per 3D box: + + A. dist_to_cloud Distance from box center to the nearest scene-cloud point. + Catches boxes placed in empty space. + + B. iou2d Reproject the box's 8 corners into the image with the same + (K, pose) BoxerNet saw, take the axis-aligned 2D bbox, + compute IoU with the OWL prompt bbox. + Catches K / pose / 3D-position inconsistency. + + C. depth_gap Camera-frame Z of the box's 8 corners vs. the Z distribution + of SDP points whose projection lands inside the OWL 2D bbox. + Catches "right angular direction, wrong distance" — the + failure mode systematic K bias / per-segment scale drift + produces, which A and B both tend to miss. + +Each metric is orthogonal: one can pass and the other two fail. Combining all +three as a filter drops clearly-bad per-frame boxes before `fuse_3d_boxes.py` +clusters them, producing a cleaner scene graph. + +Usage: + python diagnose_boxer_output.py --input + python diagnose_boxer_output.py --input --filter + python diagnose_boxer_output.py --input --filter \\ + --max_dist_to_cloud 0.5 --min_iou2d 0.1 --max_depth_gap 0.5 + +Outputs (in `//`): + diagnose_by_box.csv per-box metrics + diagnose_summary.json aggregate distributions + _3dbbs_filtered.csv (only with --filter) bad boxes removed +""" + +import argparse +import csv +import json +import math +import os +import sys +from typing import Optional + +import numpy as np +import torch + +from utils.demo_utils import DEFAULT_SEQ, EVAL_PATH +from utils.file_io import load_bb2d_csv, read_obb_csv +from utils.viewer_3d import build_seq_ctx, resolve_input + + +# --------------------------------------------------------------------------- +# Geometry helpers +# --------------------------------------------------------------------------- + + +def _project_points( + pts_world: np.ndarray, # [N, 3] + T_wc: np.ndarray, # [4, 4] world_from_camera + K: np.ndarray, # [3, 3] pinhole +) -> tuple[np.ndarray, np.ndarray]: + """Return (uv [N, 2], z [N]). z > 0 means in front of camera.""" + T_cw = np.linalg.inv(T_wc) + pts_h = np.concatenate([pts_world, np.ones((pts_world.shape[0], 1), dtype=np.float32)], axis=1) + pts_cam = pts_h @ T_cw.T + z = pts_cam[:, 2] + with np.errstate(divide="ignore", invalid="ignore"): + u = pts_cam[:, 0] / z * K[0, 0] + K[0, 2] + v = pts_cam[:, 1] / z * K[1, 1] + K[1, 2] + return np.stack([u, v], axis=1).astype(np.float32), z.astype(np.float32) + + +def _iou_xyxy(a: np.ndarray, b: np.ndarray) -> float: + """IoU between two xyxy boxes [x1, y1, x2, y2].""" + ix1 = max(a[0], b[0]); iy1 = max(a[1], b[1]) + ix2 = min(a[2], b[2]); iy2 = min(a[3], b[3]) + iw = max(0.0, ix2 - ix1); ih = max(0.0, iy2 - iy1) + inter = iw * ih + area_a = max(0.0, a[2] - a[0]) * max(0.0, a[3] - a[1]) + area_b = max(0.0, b[2] - b[0]) * max(0.0, b[3] - b[1]) + denom = area_a + area_b - inter + return float(inter / denom) if denom > 0 else 0.0 + + +def _box_corners_from_obb(obb) -> np.ndarray: + """Return the 8 world-frame corners of an ObbTW as [8, 3] float32.""" + bb3 = obb.bb3_object.squeeze(0).numpy() # [6] xmin, xmax, ymin, ymax, zmin, zmax + T_wo = obb.T_world_object # PoseTW + R_wo = T_wo.R.squeeze(0).numpy() # [3, 3] + t_wo = T_wo.t.squeeze(0).numpy() # [3] + xs = (bb3[0], bb3[1]); ys = (bb3[2], bb3[3]); zs = (bb3[4], bb3[5]) + corners_local = np.array( + [[x, y, z] for x in xs for y in ys for z in zs], dtype=np.float32 + ) + return (corners_local @ R_wo.T + t_wo[None]).astype(np.float32) + + +def _match_bb2d_for_obb( + label: str, + proj_center_uv: np.ndarray, # [2] + bb2d_entries: list[dict], # from load_bb2d_csv per-frame list + max_dist_px: float = 1e9, +) -> Optional[np.ndarray]: + """Find the OWL 2D bbox with matching label whose center is closest in the image. + + Returns xyxy array or None. run_boxer's thresh_3d filter strips the index + correspondence between BoxerNet outputs and OWL prompts, so we recover the + link heuristically by nearest-projected-center within the same label class. + """ + best = None; best_d = max_dist_px + for d in bb2d_entries: + if d.get("label") != label: + continue + bb = d["bb2d"] # (x1, y1, x2, y2) numpy array + c = np.array([0.5 * (bb[0] + bb[2]), 0.5 * (bb[1] + bb[3])], dtype=np.float32) + dist = float(np.linalg.norm(c - proj_center_uv)) + if dist < best_d: + best = np.asarray(bb, dtype=np.float32); best_d = dist + return best + + +# --------------------------------------------------------------------------- +# Per-frame context: pose + K + SDP lookup +# --------------------------------------------------------------------------- + + +def _pose_K_for_time(ctx: dict, time_ns: int) -> Optional[tuple[np.ndarray, np.ndarray, int, int]]: + """Find the closest (pose, K) to a given timestamp. Returns (T_wc, K, H, W). + + build_seq_ctx's outputs differ per dataset, but all expose pose_ts + traj + (camera poses) and calibs (CameraTW). We pick by nearest-timestamp. + """ + pose_ts = ctx.get("pose_ts") + traj = ctx.get("traj") + calibs = ctx.get("calibs") + calib_ts = ctx.get("calib_ts", pose_ts) + if pose_ts is None or traj is None or calibs is None: + return None + + # Traj may be a list of PoseTW (CA-1M path) or a tensor / list (Aria). + idx = int(np.argmin(np.abs(np.asarray(pose_ts) - time_ns))) + pose = traj[idx] if isinstance(traj, list) else traj[idx] + # pose can be a PoseTW or a 4x4 tensor; normalize. + if hasattr(pose, "_data"): + R = pose.R.numpy() if hasattr(pose.R, "numpy") else np.asarray(pose.R) + t = pose.t.numpy() if hasattr(pose.t, "numpy") else np.asarray(pose.t) + if R.ndim == 3: + R = R[0]; t = t[0] + T_wc = np.eye(4, dtype=np.float32) + T_wc[:3, :3] = R; T_wc[:3, 3] = t + else: + T_wc = np.asarray(pose, dtype=np.float32) + if T_wc.shape == (3, 4): + T_wc = np.vstack([T_wc, [0, 0, 0, 1]]).astype(np.float32) + + cidx = int(np.argmin(np.abs(np.asarray(calib_ts) - time_ns))) + cam = calibs[cidx] if isinstance(calibs, list) else calibs + # CameraTW stores [w, h, fx, fy, cx, cy, ...]. + data = cam._data if hasattr(cam, "_data") else cam + data = data.squeeze().numpy() if hasattr(data, "numpy") else np.asarray(data).squeeze() + W = int(round(float(data[0]))); H = int(round(float(data[1]))) + fx = float(data[2]); fy = float(data[3]); cx = float(data[4]); cy = float(data[5]) + K = np.array([[fx, 0, cx], [0, fy, cy], [0, 0, 1]], dtype=np.float32) + return T_wc, K, H, W + + +def _resolve_scene_cloud(ctx: dict, max_points: int = 80_000) -> np.ndarray: + """Best-effort scene cloud from the sequence context (may be empty).""" + # CA-1M exposes sdp_global. + sdp = ctx.get("sdp_global") + if sdp is not None and len(sdp) > 0: + pts = np.asarray(sdp, dtype=np.float32) + else: + # Aria path: uid_to_p3 maps unique IDs to 3-vectors. + uid_to_p3 = ctx.get("uid_to_p3") + if uid_to_p3: + pts = np.asarray(list(uid_to_p3.values()), dtype=np.float32) + else: + return np.zeros((0, 3), dtype=np.float32) + pts = pts[np.isfinite(pts).all(axis=1)] + if pts.shape[0] > max_points: + idx = np.random.default_rng(0).choice(pts.shape[0], size=max_points, replace=False) + pts = pts[idx] + return pts + + +# --------------------------------------------------------------------------- +# Main +# --------------------------------------------------------------------------- + + +def main(): + parser = argparse.ArgumentParser( + description=__doc__.splitlines()[0], + formatter_class=argparse.RawDescriptionHelpFormatter, + ) + parser.add_argument("--input", type=str, default=DEFAULT_SEQ, help="path to the sequence folder") + parser.add_argument("--output_dir", type=str, default=EVAL_PATH, help="Where CSVs live (default: ~/viz_boxer)") + parser.add_argument("--write_name", default="boxer", type=str, help="CSV prefix (default: boxer)") + parser.add_argument("--filter", action="store_true", help="write a *_filtered.csv with bad boxes dropped") + parser.add_argument("--max_dist_to_cloud", type=float, default=0.5, help="filter threshold (m)") + parser.add_argument("--min_iou2d", type=float, default=0.1, help="filter threshold") + parser.add_argument("--max_depth_gap", type=float, default=0.5, help="filter threshold (m)") + args = parser.parse_args() + + input_path, dataset_type, seq_name = resolve_input(args.input) + output_dir = os.path.expanduser(args.output_dir) + log_dir = os.path.join(output_dir, seq_name) + + obb_csv = os.path.join(log_dir, f"{args.write_name}_3dbbs.csv") + bb2d_csv = os.path.join(log_dir, "owl_2dbbs.csv") + missing = [p for p in (obb_csv, bb2d_csv) if not os.path.exists(p)] + if missing: + print("[diagnose] missing prerequisite file(s):") + for p in missing: + print(f" - {p}") + print(f"\nGenerate them with:\n python run_boxer.py --input {seq_name} --skip_viz") + sys.exit(1) + + print(f"[diagnose] loading OBBs from {obb_csv}") + timed_obbs = read_obb_csv(obb_csv) + total_obbs = sum(len(o) for o in timed_obbs.values()) + print(f"[diagnose] loading 2D bboxes from {bb2d_csv}") + timed_bb2d = load_bb2d_csv(bb2d_csv) + + print(f"[diagnose] building sequence context ({dataset_type})") + ctx = build_seq_ctx(input_path, dataset_type) + cloud = _resolve_scene_cloud(ctx) + has_cloud = cloud.shape[0] > 0 + if not has_cloud: + print("[diagnose] no scene cloud available for this dataset — A and C will be NaN") + + bb2d_times = np.array(sorted(timed_bb2d.keys()), dtype=np.int64) if timed_bb2d else np.zeros(0, dtype=np.int64) + + rows_out: list[dict] = [] + for time_ns in sorted(timed_obbs.keys()): + obbs = timed_obbs[time_ns] + if not obbs: + continue + + ctx_pose_K = _pose_K_for_time(ctx, time_ns) + if ctx_pose_K is None: + continue + T_wc, K, H, W = ctx_pose_K + + # Match the frame's 2D bboxes by nearest timestamp (within 50 ms). + bb2d_entries: list[dict] = [] + if bb2d_times.size > 0: + bb_idx = int(np.searchsorted(bb2d_times, time_ns)) + bb_idx = min(bb_idx, bb2d_times.size - 1) + nearest = int(bb2d_times[bb_idx]) + if abs(nearest - time_ns) < 50_000_000: + entry = timed_bb2d.get(nearest, {}) + n = len(entry.get("bb2d", [])) + for i in range(n): + bb2d_entries.append({ + "label": entry["labels"][i] if "labels" in entry else "", + "bb2d": np.asarray(entry["bb2d"][i], dtype=np.float32), + }) + + for obb in obbs: + # Label as a string + label = "" + try: + txt = obb.text_string() + label = txt[0] if isinstance(txt, (list, tuple)) else str(txt) + except Exception: + pass + + center = obb.T_world_object.t.squeeze(0).numpy().astype(np.float32) + corners = _box_corners_from_obb(obb) + + # Metric A + dist_to_cloud = float("nan") + if has_cloud: + d2 = np.sum((cloud - center[None, :]) ** 2, axis=1) + dist_to_cloud = float(np.sqrt(d2.min())) + + # Project center for OWL matching + proj_center_uv, proj_center_z = _project_points(center[None, :], T_wc, K) + owl_bb = None + if proj_center_z[0] > 1e-3 and np.isfinite(proj_center_uv).all(): + owl_bb = _match_bb2d_for_obb(label, proj_center_uv[0], bb2d_entries) + + # Metric B + iou2d = float("nan") + if owl_bb is not None: + uv, z = _project_points(corners, T_wc, K) + mask = (z > 1e-3) & np.isfinite(uv).all(axis=1) + if mask.sum() >= 2: + u = uv[mask, 0]; v = uv[mask, 1] + proj_bb = np.array([u.min(), v.min(), u.max(), v.max()], dtype=np.float32) + iou2d = _iou_xyxy(proj_bb, owl_bb) + + # Metric C + depth_gap = float("nan") + if owl_bb is not None and has_cloud: + uv, z = _project_points(cloud, T_wc, K) + in_front = z > 1e-3 + in_bbox = ( + (uv[:, 0] >= owl_bb[0]) & (uv[:, 0] <= owl_bb[2]) & + (uv[:, 1] >= owl_bb[1]) & (uv[:, 1] <= owl_bb[3]) + ) + keep = in_front & in_bbox & np.isfinite(uv).all(axis=1) + zs_in = z[keep] + if zs_in.size >= 30: + q10, q50, q90 = np.percentile(zs_in, [10, 50, 90]).tolist() + _, z_box = _project_points(corners, T_wc, K) + z_box = z_box[np.isfinite(z_box)] + if z_box.size > 0: + z_box_min, z_box_max = float(z_box.min()), float(z_box.max()) + depth_gap = float(max(0.0, q10 - z_box_max, z_box_min - q90)) + + rows_out.append({ + "time_ns": time_ns, + "label": label, + "prob": float(obb.prob.item()) if hasattr(obb, "prob") else float("nan"), + "cx": float(center[0]), "cy": float(center[1]), "cz": float(center[2]), + "dist_to_cloud": dist_to_cloud, + "iou2d": iou2d, + "owl_matched": owl_bb is not None, + "depth_gap": depth_gap, + }) + + # Write per-box CSV. + out_csv = os.path.join(log_dir, "diagnose_by_box.csv") + os.makedirs(log_dir, exist_ok=True) + if rows_out: + with open(out_csv, "w", newline="") as f: + w = csv.DictWriter(f, fieldnames=list(rows_out[0].keys())) + w.writeheader() + for r in rows_out: + w.writerow(r) + print(f"[diagnose] wrote {len(rows_out)} rows → {out_csv}") + + # Aggregate summary. + def _stats(key: str) -> Optional[dict]: + vals = np.array( + [r[key] for r in rows_out if r[key] is not None and not (isinstance(r[key], float) and math.isnan(r[key]))] + ) + if vals.size == 0: + return None + return { + "n": int(vals.size), + "mean": float(vals.mean()), + "median": float(np.median(vals)), + "p90": float(np.percentile(vals, 90)), + "p99": float(np.percentile(vals, 99)), + "max": float(vals.max()), + } + summary = { + "seq_name": seq_name, + "dataset_type": dataset_type, + "n_boxes": len(rows_out), + "n_owl_matched": sum(1 for r in rows_out if r["owl_matched"]), + "has_scene_cloud": bool(has_cloud), + "dist_to_cloud": _stats("dist_to_cloud"), + "iou2d": _stats("iou2d"), + "depth_gap": _stats("depth_gap"), + } + summary_path = os.path.join(log_dir, "diagnose_summary.json") + with open(summary_path, "w") as f: + json.dump(summary, f, indent=2) + print(json.dumps(summary, indent=2)) + + # Optional filter — write a reduced CSV for downstream fuse_3d_boxes.py. + if args.filter: + filtered_csv = os.path.join(log_dir, f"{args.write_name}_3dbbs_filtered.csv") + key_to_row = { + (r["time_ns"], round(r["cx"], 5), round(r["cy"], 5), round(r["cz"], 5)): r + for r in rows_out + } + kept, dropped = 0, 0 + with open(obb_csv) as fin, open(filtered_csv, "w", newline="") as fout: + header = fin.readline() + fout.write(header) + for line in fin: + parts = line.split(",") + key = ( + int(parts[0]), + round(float(parts[1]), 5), + round(float(parts[2]), 5), + round(float(parts[3]), 5), + ) + r = key_to_row.get(key) + ok = True + if r is not None: + if not math.isnan(r["dist_to_cloud"]) and r["dist_to_cloud"] > args.max_dist_to_cloud: + ok = False + elif not math.isnan(r["iou2d"]) and r["iou2d"] < args.min_iou2d: + ok = False + elif not math.isnan(r["depth_gap"]) and r["depth_gap"] > args.max_depth_gap: + ok = False + if ok: + fout.write(line); kept += 1 + else: + dropped += 1 + print(f"[diagnose] filter: kept {kept}, dropped {dropped} → {filtered_csv}") + print(f"[diagnose] thresholds: max_dist_to_cloud={args.max_dist_to_cloud} " + f"min_iou2d={args.min_iou2d} max_depth_gap={args.max_depth_gap}") + print(f"[diagnose] now run: python -c \"from utils.fuse_3d_boxes import fuse_obbs_from_csv; " + f"fuse_obbs_from_csv('{filtered_csv}')\"") + + +if __name__ == "__main__": + main()