Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion ROADMAP.md
Original file line number Diff line number Diff line change
Expand Up @@ -867,7 +867,7 @@ get from a Python script calling `/v1/chat/completions`.
- Detects: one bad GPU in a pool, one node with wrong precision config, asymmetric NCCL issues
- Critical for production clusters where "5% divergence rate" might be "one node is broken, the rest are fine"

## M90: Hardware Precision Baseline Library
## M90: Hardware Precision Baseline Library
- Collect and maintain reference data for expected numerical differences across:
- GPU architectures: A100 vs H100 vs H200 vs Gaudi2 vs Gaudi3
- Precision modes: FP16 vs BF16 vs FP8 vs INT8-KV
Expand Down
3 changes: 2 additions & 1 deletion docs/iterations/current.md
Original file line number Diff line number Diff line change
Expand Up @@ -53,4 +53,5 @@ shell for exploratory comparison of two endpoints.
| M85 | 2026-04-06 | Offline Mode — File-Based Comparison | ✅ merged | Both approved |
| M87 | 2026-04-06 | Automatic KV Cache Export from vLLM | ✅ merged | Both approved |
| M88 | 2026-04-06 | Framework-Level Inference Hooks | ✅ merged | Both approved |
| M89 | 2026-04-06 | PD Topology-Aware Testing | ⏳ pending review | — |
| M89 | 2026-04-06 | PD Topology-Aware Testing | ✅ merged | Both approved |
| M90 | 2026-04-06 | Hardware Precision Baseline Library | ⏳ pending review | — |
2 changes: 2 additions & 0 deletions src/xpyd_acc/cli/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
_run_length_bias,
_run_sensitivity,
_run_watch,
handle_baseline_db,
handle_capture_kv,
handle_heatmap,
handle_root_cause,
Expand Down Expand Up @@ -145,6 +146,7 @@ def main(argv: list[str] | None = None) -> None:
"latency-regression": lambda: _run_latency_regression(args),
"compare-files": lambda: _run_file_compare(args),
"topology-scan": lambda: handle_topology_scan(args),
"baseline-db": lambda: handle_baseline_db(args),
}

if args.command in _early:
Expand Down
81 changes: 81 additions & 0 deletions src/xpyd_acc/cli/analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -515,3 +515,84 @@ def mock_test(p_node: TopologyNode, d_node: TopologyNode) -> NodePairResult:
if getattr(args, "json", None):
report.to_json(args.json)
print(f"Topology report exported to {args.json}")


def _require_baseline_profile(db, name):
    """Return the profile called *name* from *db*, or exit with status 1.

    When ``db.get_profile(name)`` returns ``None`` this prints
    ``Profile '<name>' not found.`` to stderr and raises ``SystemExit(1)``.
    """
    profile = db.get_profile(name)
    if profile is None:
        print(f"Profile '{name}' not found.", file=sys.stderr)
        raise SystemExit(1)
    return profile


def handle_baseline_db(args: argparse.Namespace) -> None:
    """Handle baseline-db subcommand.

    Dispatches on ``args.baseline_action``:

    * ``list``     -- print ``format_profile_list(db)``.
    * ``show``     -- print the profile named by ``args.profile``.
    * ``export``   -- call ``db.export_json(args.output)`` and report the count.
    * ``import``   -- call ``db.import_json(args.input)`` and report the count.
    * ``find``     -- print every profile matching ``args.gpu``,
      ``args.precision`` and ``args.tp``.
    * ``classify`` -- classify the supplied observations against
      ``args.profile`` and optionally write the report to ``args.json``.

    Raises:
        SystemExit: with status 1 when a requested profile is missing, when
            ``classify`` is given no observations, or when no known action
            was selected (a usage line is printed to stderr).
    """
    import json as _json
    from pathlib import Path

    from xpyd_acc.hw_baseline import (
        BaselineDB,
        classify_difference,
        format_classification,
        format_profile,
        format_profile_list,
    )

    db = BaselineDB()
    # The attribute is absent when the handler is invoked without going
    # through the baseline-db sub-parser, hence the defensive getattr.
    action = getattr(args, "baseline_action", None)

    if action == "list":
        print(format_profile_list(db))

    elif action == "show":
        print(format_profile(_require_baseline_profile(db, args.profile)))

    elif action == "export":
        db.export_json(args.output)
        print(f"Exported {len(db.list_profiles())} profiles to {args.output}")

    elif action == "import":
        count = db.import_json(args.input)
        print(f"Imported {count} profiles.")

    elif action == "find":
        results = db.find_profiles(
            gpu_arch=args.gpu,
            precision_mode=args.precision,
            tp_degree=args.tp,
        )
        if not results:
            print("No matching profiles found.")
        else:
            for p in results:
                print(format_profile(p))
                print()

    elif action == "classify":
        profile = _require_baseline_profile(db, args.profile)
        # Only forward the metrics the user actually supplied.
        observations: dict[str, float] = {
            metric: value
            for metric in ("max_abs_diff", "mean_abs_diff", "cosine_sim")
            if (value := getattr(args, metric)) is not None
        }
        if not observations:
            print(
                "No observations provided. "
                "Use --max-abs-diff, --mean-abs-diff, or --cosine-sim.",
                file=sys.stderr,
            )
            raise SystemExit(1)
        report = classify_difference(profile, observations)
        print(format_classification(report))
        if getattr(args, "json", None):
            # Explicit encoding keeps the export independent of the locale.
            Path(args.json).write_text(
                _json.dumps(report.to_dict(), indent=2) + "\n",
                encoding="utf-8",
            )
            print(f"Classification exported to {args.json}")

    else:
        print(
            "Usage: xpyd-acc baseline-db "
            "{list|show|export|import|find|classify}",
            file=sys.stderr,
        )
        raise SystemExit(1)
40 changes: 40 additions & 0 deletions src/xpyd_acc/cli/parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@ def register_all(sub: argparse._SubParsersAction) -> None:
_register_file_compare(sub)
_register_trace(sub)
_register_topology_scan(sub)
_register_baseline_db(sub)
def _register_compare(sub):
lp = sub.add_parser("compare-logprobs", help="Compare logprobs between two endpoints")
lp.add_argument("--baseline", required=True, help="Baseline endpoint URL")
Expand Down Expand Up @@ -759,3 +760,42 @@ def _register_topology_scan(sub):
"--mock", action="store_true", default=False,
help="Use mock topology for testing (no real endpoints)",
)


def _register_baseline_db(sub):
bd = sub.add_parser(
"baseline-db",
help="Manage hardware precision baseline profiles",
)
bd_sub = bd.add_subparsers(dest="baseline_action")
bd_sub.add_parser("list", help="List available hardware profiles")

show_p = bd_sub.add_parser("show", help="Show details of a specific profile")
show_p.add_argument("profile", help="Profile name")

export_p = bd_sub.add_parser("export", help="Export all profiles to JSON")
export_p.add_argument("--output", required=True, help="Output JSON file path")

import_p = bd_sub.add_parser("import", help="Import profiles from JSON")
import_p.add_argument("--input", required=True, help="Input JSON file path")

find_p = bd_sub.add_parser("find", help="Find profiles by criteria")
find_p.add_argument("--gpu", default=None, help="Filter by GPU architecture")
find_p.add_argument("--precision", default=None, help="Filter by precision mode")
find_p.add_argument("--tp", type=int, default=None, help="Filter by TP degree")

classify_p = bd_sub.add_parser("classify", help="Classify observed differences")
classify_p.add_argument("--profile", required=True, help="Profile name to compare against")
classify_p.add_argument(
"--max-abs-diff", type=float, default=None, dest="max_abs_diff",
help="Observed max absolute diff",
)
classify_p.add_argument(
"--mean-abs-diff", type=float, default=None, dest="mean_abs_diff",
help="Observed mean absolute diff",
)
classify_p.add_argument(
"--cosine-sim", type=float, default=None, dest="cosine_sim",
help="Observed cosine similarity",
)
classify_p.add_argument("--json", default=None, help="Export classification as JSON")
Loading
Loading