diff --git a/ROADMAP.md b/ROADMAP.md index 956e916..8834166 100644 --- a/ROADMAP.md +++ b/ROADMAP.md @@ -867,7 +867,7 @@ get from a Python script calling `/v1/chat/completions`. - Detects: one bad GPU in a pool, one node with wrong precision config, asymmetric NCCL issues - Critical for production clusters where "5% divergence rate" might be "one node is broken, the rest are fine" -## M90: Hardware Precision Baseline Library +## M90: Hardware Precision Baseline Library ✅ - Collect and maintain reference data for expected numerical differences across: - GPU architectures: A100 vs H100 vs H200 vs Gaudi2 vs Gaudi3 - Precision modes: FP16 vs BF16 vs FP8 vs INT8-KV diff --git a/docs/iterations/current.md b/docs/iterations/current.md index 627d48d..bf3710e 100644 --- a/docs/iterations/current.md +++ b/docs/iterations/current.md @@ -53,4 +53,5 @@ shell for exploratory comparison of two endpoints. | M85 | 2026-04-06 | Offline Mode — File-Based Comparison | ✅ merged | Both approved | | M87 | 2026-04-06 | Automatic KV Cache Export from vLLM | ✅ merged | Both approved | | M88 | 2026-04-06 | Framework-Level Inference Hooks | ✅ merged | Both approved | -| M89 | 2026-04-06 | PD Topology-Aware Testing | ⏳ pending review | — | +| M89 | 2026-04-06 | PD Topology-Aware Testing | ✅ merged | Both approved | +| M90 | 2026-04-06 | Hardware Precision Baseline Library | ⏳ pending review | — | diff --git a/src/xpyd_acc/cli/__init__.py b/src/xpyd_acc/cli/__init__.py index 15c6bb8..2057105 100644 --- a/src/xpyd_acc/cli/__init__.py +++ b/src/xpyd_acc/cli/__init__.py @@ -19,6 +19,7 @@ _run_length_bias, _run_sensitivity, _run_watch, + handle_baseline_db, handle_capture_kv, handle_heatmap, handle_root_cause, @@ -145,6 +146,7 @@ def main(argv: list[str] | None = None) -> None: "latency-regression": lambda: _run_latency_regression(args), "compare-files": lambda: _run_file_compare(args), "topology-scan": lambda: handle_topology_scan(args), + "baseline-db": lambda: handle_baseline_db(args), } if 
def handle_baseline_db(args: argparse.Namespace) -> None:
    """Handle the ``baseline-db`` subcommand.

    Dispatches on ``args.baseline_action`` (``list``, ``show``, ``export``,
    ``import``, ``find`` or ``classify``) against a freshly constructed
    ``BaselineDB``.  Results are printed to stdout; every failure is reported
    as a one-line message on stderr followed by exit status 1.

    Args:
        args: Parsed command-line namespace.  The attributes read depend on
            the action: ``profile`` (show/classify), ``output`` (export),
            ``input`` (import), ``gpu``/``precision``/``tp`` (find), and
            ``max_abs_diff``/``mean_abs_diff``/``cosine_sim`` plus the
            optional ``json`` path (classify).

    Raises:
        SystemExit: With status 1 when the action is missing or unknown, a
            profile is not found, no observation was supplied, or a file
            could not be read, parsed or written.
    """
    import json as _json
    from pathlib import Path

    from xpyd_acc.hw_baseline import (
        BaselineDB,
        classify_difference,
        format_classification,
        format_profile,
        format_profile_list,
    )

    def _error(message: str) -> SystemExit:
        """Print *message* to stderr and return the exit-1 exception to raise."""
        print(message, file=sys.stderr)
        return SystemExit(1)

    db = BaselineDB()
    action = getattr(args, "baseline_action", None)

    if action == "list":
        print(format_profile_list(db))

    elif action == "show":
        profile = db.get_profile(args.profile)
        if profile is None:
            raise _error(f"Profile '{args.profile}' not found.")
        print(format_profile(profile))

    elif action == "export":
        try:
            db.export_json(args.output)
        except OSError as exc:
            raise _error(f"Failed to export profiles to {args.output}: {exc}") from exc
        print(f"Exported {len(db.list_profiles())} profiles to {args.output}")

    elif action == "import":
        # NOTE(review): the database is rebuilt on every invocation and nothing
        # in this handler writes it back, so this action only loads, validates
        # and counts the profiles in the file.
        try:
            count = db.import_json(args.input)
        except (OSError, ValueError, KeyError, TypeError, AttributeError) as exc:
            # ValueError covers json.JSONDecodeError; the remaining types are
            # what a structurally malformed payload raises while being parsed.
            raise _error(f"Failed to import profiles from {args.input}: {exc}") from exc
        print(f"Imported {count} profiles.")

    elif action == "find":
        results = db.find_profiles(
            gpu_arch=args.gpu,
            precision_mode=args.precision,
            tp_degree=args.tp,
        )
        if not results:
            print("No matching profiles found.")
        else:
            for p in results:
                print(format_profile(p))
                print()

    elif action == "classify":
        profile = db.get_profile(args.profile)
        if profile is None:
            raise _error(f"Profile '{args.profile}' not found.")

        # Keep only the metrics the user actually supplied, in a fixed order.
        observations: dict[str, float] = {}
        for metric in ("max_abs_diff", "mean_abs_diff", "cosine_sim"):
            value = getattr(args, metric)
            if value is not None:
                observations[metric] = value
        if not observations:
            raise _error(
                "No observations provided. "
                "Use --max-abs-diff, --mean-abs-diff, or --cosine-sim."
            )

        report = classify_difference(profile, observations)
        print(format_classification(report))
        if getattr(args, "json", None):
            payload = _json.dumps(report.to_dict(), indent=2) + "\n"
            try:
                Path(args.json).write_text(payload, encoding="utf-8")
            except OSError as exc:
                raise _error(
                    f"Failed to write classification to {args.json}: {exc}"
                ) from exc
            print(f"Classification exported to {args.json}")

    else:
        raise _error(
            "Usage: xpyd-acc baseline-db "
            "{list|show|export|import|find|classify}"
        )
path") + + find_p = bd_sub.add_parser("find", help="Find profiles by criteria") + find_p.add_argument("--gpu", default=None, help="Filter by GPU architecture") + find_p.add_argument("--precision", default=None, help="Filter by precision mode") + find_p.add_argument("--tp", type=int, default=None, help="Filter by TP degree") + + classify_p = bd_sub.add_parser("classify", help="Classify observed differences") + classify_p.add_argument("--profile", required=True, help="Profile name to compare against") + classify_p.add_argument( + "--max-abs-diff", type=float, default=None, dest="max_abs_diff", + help="Observed max absolute diff", + ) + classify_p.add_argument( + "--mean-abs-diff", type=float, default=None, dest="mean_abs_diff", + help="Observed mean absolute diff", + ) + classify_p.add_argument( + "--cosine-sim", type=float, default=None, dest="cosine_sim", + help="Observed cosine similarity", + ) + classify_p.add_argument("--json", default=None, help="Export classification as JSON") diff --git a/src/xpyd_acc/hw_baseline.py b/src/xpyd_acc/hw_baseline.py new file mode 100644 index 0000000..1919d6b --- /dev/null +++ b/src/xpyd_acc/hw_baseline.py @@ -0,0 +1,437 @@ +"""Hardware Precision Baseline Library. + +Maintains reference precision profiles for common GPU hardware configurations +and classifies observed numerical differences as expected hardware variance +or likely software bugs. 
+""" + +from __future__ import annotations + +import json +from dataclasses import dataclass, field +from pathlib import Path + + +@dataclass +class PrecisionRange: + """Expected numerical range for a specific metric.""" + + metric: str # "max_abs_diff", "mean_abs_diff", "cosine_sim" + expected_min: float + expected_max: float + description: str = "" + + def contains(self, value: float) -> bool: + """Check if value falls within the expected range.""" + return self.expected_min <= value <= self.expected_max + + def to_dict(self) -> dict: + return { + "metric": self.metric, + "expected_min": self.expected_min, + "expected_max": self.expected_max, + "description": self.description, + } + + @classmethod + def from_dict(cls, data: dict) -> PrecisionRange: + return cls( + metric=data["metric"], + expected_min=data["expected_min"], + expected_max=data["expected_max"], + description=data.get("description", ""), + ) + + +@dataclass +class HardwareProfile: + """A hardware configuration with expected precision characteristics.""" + + name: str + gpu_arch: str # e.g. "A100", "H100", "H200", "Gaudi2", "Gaudi3" + precision_mode: str # e.g. "FP16", "BF16", "FP8", "INT8-KV" + attention_impl: str # e.g. 
"FlashAttention-v2", "PagedAttention", "xFormers" + tp_degree: int # tensor parallelism degree + ranges: list[PrecisionRange] = field(default_factory=list) + metadata: dict = field(default_factory=dict) + + def to_dict(self) -> dict: + return { + "name": self.name, + "gpu_arch": self.gpu_arch, + "precision_mode": self.precision_mode, + "attention_impl": self.attention_impl, + "tp_degree": self.tp_degree, + "ranges": [r.to_dict() for r in self.ranges], + "metadata": self.metadata, + } + + @classmethod + def from_dict(cls, data: dict) -> HardwareProfile: + return cls( + name=data["name"], + gpu_arch=data["gpu_arch"], + precision_mode=data["precision_mode"], + attention_impl=data["attention_impl"], + tp_degree=data["tp_degree"], + ranges=[PrecisionRange.from_dict(r) for r in data.get("ranges", [])], + metadata=data.get("metadata", {}), + ) + + def get_range(self, metric: str) -> PrecisionRange | None: + """Look up the expected range for a given metric.""" + for r in self.ranges: + if r.metric == metric: + return r + return None + + +@dataclass +class DifferenceVerdict: + """Classification result for an observed numerical difference.""" + + metric: str + observed_value: float + expected_range: PrecisionRange | None + classification: str # "expected", "suspicious", "likely_bug", "unknown" + reasoning: str + + def to_dict(self) -> dict: + return { + "metric": self.metric, + "observed_value": self.observed_value, + "expected_range": self.expected_range.to_dict() if self.expected_range else None, + "classification": self.classification, + "reasoning": self.reasoning, + } + + +@dataclass +class ClassificationReport: + """Full classification report for a set of observed differences.""" + + profile_name: str + verdicts: list[DifferenceVerdict] + + @property + def overall_classification(self) -> str: + """Return the worst classification across all verdicts.""" + priority = {"likely_bug": 3, "suspicious": 2, "unknown": 1, "expected": 0} + if not self.verdicts: + return 
"unknown" + worst = max(self.verdicts, key=lambda v: priority.get(v.classification, 0)) + return worst.classification + + def to_dict(self) -> dict: + return { + "profile_name": self.profile_name, + "overall_classification": self.overall_classification, + "verdicts": [v.to_dict() for v in self.verdicts], + } + + +def classify_difference( + profile: HardwareProfile, + observations: dict[str, float], +) -> ClassificationReport: + """Classify observed numerical differences against a hardware profile. + + Args: + profile: hardware profile with expected ranges + observations: dict mapping metric names to observed values + e.g. {"max_abs_diff": 0.001, "mean_abs_diff": 0.0001, "cosine_sim": 0.9999} + + Returns: + ClassificationReport with per-metric verdicts + """ + verdicts: list[DifferenceVerdict] = [] + + for metric, value in observations.items(): + expected = profile.get_range(metric) + + if expected is None: + verdicts.append(DifferenceVerdict( + metric=metric, + observed_value=value, + expected_range=None, + classification="unknown", + reasoning=f"No expected range for '{metric}' in profile '{profile.name}'", + )) + continue + + if expected.contains(value): + verdicts.append(DifferenceVerdict( + metric=metric, + observed_value=value, + expected_range=expected, + classification="expected", + reasoning=( + f"{metric}={value} is within expected range " + f"[{expected.expected_min}, {expected.expected_max}] " + f"for {profile.name}" + ), + )) + else: + # Determine severity: how far outside the range? 
+ if metric == "cosine_sim": + # For cosine similarity, below min is bad + if value < expected.expected_min: + gap = expected.expected_min - value + severity = "likely_bug" if gap > 0.01 else "suspicious" + else: + severity = "expected" # above max for cosine is fine + else: + # For diff metrics, above max is bad + if value > expected.expected_max: + max_val = expected.expected_max + ratio = value / max_val if max_val > 0 else float("inf") + severity = "likely_bug" if ratio > 2.0 else "suspicious" + else: + severity = "expected" # below min for diff is fine (less error) + + verdicts.append(DifferenceVerdict( + metric=metric, + observed_value=value, + expected_range=expected, + classification=severity, + reasoning=( + f"{metric}={value} is outside expected range " + f"[{expected.expected_min}, {expected.expected_max}] " + f"for {profile.name}" + ), + )) + + return ClassificationReport(profile_name=profile.name, verdicts=verdicts) + + +# --- Built-in Profiles --- + +_BUILTIN_PROFILES: list[HardwareProfile] = [ + HardwareProfile( + name="a100-bf16-tp1", + gpu_arch="A100", + precision_mode="BF16", + attention_impl="FlashAttention-v2", + tp_degree=1, + ranges=[ + PrecisionRange("max_abs_diff", 0.0, 0.002, "BF16 rounding on A100"), + PrecisionRange("mean_abs_diff", 0.0, 0.0005, "Average BF16 error"), + PrecisionRange("cosine_sim", 0.9995, 1.0, "Expected cosine similarity"), + ], + ), + HardwareProfile( + name="a100-fp16-tp1", + gpu_arch="A100", + precision_mode="FP16", + attention_impl="FlashAttention-v2", + tp_degree=1, + ranges=[ + PrecisionRange("max_abs_diff", 0.0, 0.001, "FP16 rounding on A100"), + PrecisionRange("mean_abs_diff", 0.0, 0.0002, "Average FP16 error"), + PrecisionRange("cosine_sim", 0.9998, 1.0, "Expected cosine similarity"), + ], + ), + HardwareProfile( + name="h100-bf16-tp4", + gpu_arch="H100", + precision_mode="BF16", + attention_impl="FlashAttention-v3", + tp_degree=4, + ranges=[ + PrecisionRange("max_abs_diff", 0.0, 0.005, "BF16 + TP=4 accumulation 
on H100"), + PrecisionRange("mean_abs_diff", 0.0, 0.001, "Average BF16 TP=4 error"), + PrecisionRange("cosine_sim", 0.999, 1.0, "Expected cosine similarity"), + ], + ), + HardwareProfile( + name="h100-fp8-tp4", + gpu_arch="H100", + precision_mode="FP8", + attention_impl="FlashAttention-v3", + tp_degree=4, + ranges=[ + PrecisionRange("max_abs_diff", 0.0, 0.02, "FP8 quantization noise on H100"), + PrecisionRange("mean_abs_diff", 0.0, 0.005, "Average FP8 error"), + PrecisionRange("cosine_sim", 0.995, 1.0, "Expected cosine similarity with FP8"), + ], + ), + HardwareProfile( + name="h200-bf16-tp8", + gpu_arch="H200", + precision_mode="BF16", + attention_impl="FlashAttention-v3", + tp_degree=8, + ranges=[ + PrecisionRange("max_abs_diff", 0.0, 0.008, "BF16 + TP=8 on H200"), + PrecisionRange("mean_abs_diff", 0.0, 0.002, "Average BF16 TP=8 error"), + PrecisionRange("cosine_sim", 0.998, 1.0, "Expected cosine similarity"), + ], + ), + HardwareProfile( + name="gaudi2-bf16-tp1", + gpu_arch="Gaudi2", + precision_mode="BF16", + attention_impl="PagedAttention", + tp_degree=1, + ranges=[ + PrecisionRange("max_abs_diff", 0.0, 0.003, "BF16 on Gaudi2"), + PrecisionRange("mean_abs_diff", 0.0, 0.0008, "Average BF16 error on Gaudi2"), + PrecisionRange("cosine_sim", 0.9993, 1.0, "Expected cosine similarity"), + ], + ), + HardwareProfile( + name="gaudi3-fp8-tp4", + gpu_arch="Gaudi3", + precision_mode="FP8", + attention_impl="PagedAttention", + tp_degree=4, + ranges=[ + PrecisionRange("max_abs_diff", 0.0, 0.025, "FP8 on Gaudi3 TP=4"), + PrecisionRange("mean_abs_diff", 0.0, 0.006, "Average FP8 error on Gaudi3"), + PrecisionRange("cosine_sim", 0.994, 1.0, "Expected cosine similarity"), + ], + ), + HardwareProfile( + name="a100-int8kv-tp2", + gpu_arch="A100", + precision_mode="INT8-KV", + attention_impl="PagedAttention", + tp_degree=2, + ranges=[ + PrecisionRange("max_abs_diff", 0.0, 0.015, "INT8 KV cache quantization on A100"), + PrecisionRange("mean_abs_diff", 0.0, 0.004, "Average INT8-KV 
error"), + PrecisionRange("cosine_sim", 0.996, 1.0, "Expected cosine similarity with INT8-KV"), + ], + ), +] + + +class BaselineDB: + """Database of hardware precision profiles.""" + + def __init__(self) -> None: + self._profiles: dict[str, HardwareProfile] = {} + # Load built-in profiles + for p in _BUILTIN_PROFILES: + self._profiles[p.name] = p + + def list_profiles(self) -> list[str]: + """Return sorted list of available profile names.""" + return sorted(self._profiles.keys()) + + def get_profile(self, name: str) -> HardwareProfile | None: + """Look up a profile by name.""" + return self._profiles.get(name) + + def add_profile(self, profile: HardwareProfile) -> None: + """Add or replace a profile.""" + self._profiles[profile.name] = profile + + def remove_profile(self, name: str) -> bool: + """Remove a profile. Returns True if it existed.""" + if name in self._profiles: + del self._profiles[name] + return True + return False + + def export_json(self, path: str | Path) -> None: + """Export all profiles to a JSON file.""" + data = { + "version": 1, + "profiles": [p.to_dict() for p in self._profiles.values()], + } + Path(path).write_text(json.dumps(data, indent=2) + "\n") + + def import_json(self, path: str | Path) -> int: + """Import profiles from a JSON file. 
Returns count of imported profiles.""" + data = json.loads(Path(path).read_text()) + profiles = data.get("profiles", []) + count = 0 + for p_data in profiles: + profile = HardwareProfile.from_dict(p_data) + self._profiles[profile.name] = profile + count += 1 + return count + + def find_profiles( + self, + gpu_arch: str | None = None, + precision_mode: str | None = None, + tp_degree: int | None = None, + ) -> list[HardwareProfile]: + """Find profiles matching given criteria.""" + results = [] + for p in self._profiles.values(): + if gpu_arch and p.gpu_arch.lower() != gpu_arch.lower(): + continue + if precision_mode and p.precision_mode.lower() != precision_mode.lower(): + continue + if tp_degree is not None and p.tp_degree != tp_degree: + continue + results.append(p) + return sorted(results, key=lambda p: p.name) + + +def format_profile(profile: HardwareProfile) -> str: + """Format a single profile for terminal display.""" + lines: list[str] = [] + lines.append(f"Profile: {profile.name}") + lines.append(f" GPU: {profile.gpu_arch}") + lines.append(f" Precision: {profile.precision_mode}") + lines.append(f" Attention: {profile.attention_impl}") + lines.append(f" TP Degree: {profile.tp_degree}") + + if profile.ranges: + lines.append(" Expected Ranges:") + for r in profile.ranges: + desc = f" ({r.description})" if r.description else "" + lines.append(f" {r.metric}: [{r.expected_min}, {r.expected_max}]{desc}") + + if profile.metadata: + lines.append(f" Metadata: {profile.metadata}") + + return "\n".join(lines) + + +def format_profile_list(db: BaselineDB) -> str: + """Format the profile list for terminal display.""" + names = db.list_profiles() + if not names: + return "No profiles available." 
+ + lines: list[str] = [] + hdr = f"{'Name':<25} {'GPU':<10} {'Precision':<10} {'Attention':<20} {'TP':>3}" + lines.append(hdr) + lines.append("-" * len(hdr)) + + for name in names: + p = db.get_profile(name) + if p: + lines.append( + f"{p.name:<25} {p.gpu_arch:<10} {p.precision_mode:<10} " + f"{p.attention_impl:<20} {p.tp_degree:>3}" + ) + + return "\n".join(lines) + + +def format_classification(report: ClassificationReport) -> str: + """Format a classification report for terminal display.""" + lines: list[str] = [] + icon_map = { + "expected": "✅", + "suspicious": "⚠️", + "likely_bug": "❌", + "unknown": "❓", + } + overall_icon = icon_map.get(report.overall_classification, "❓") + lines.append(f"Hardware Profile: {report.profile_name}") + lines.append(f"Overall: {overall_icon} {report.overall_classification}") + lines.append("") + + for v in report.verdicts: + icon = icon_map.get(v.classification, "❓") + lines.append(f" {icon} {v.metric} = {v.observed_value}") + lines.append(f" {v.reasoning}") + + return "\n".join(lines) diff --git a/tests/test_hw_baseline.py b/tests/test_hw_baseline.py new file mode 100644 index 0000000..4e4debb --- /dev/null +++ b/tests/test_hw_baseline.py @@ -0,0 +1,334 @@ +"""Tests for hw_baseline module — Hardware Precision Baseline Library.""" + +from __future__ import annotations + +import json +from pathlib import Path + +from xpyd_acc.hw_baseline import ( + BaselineDB, + ClassificationReport, + DifferenceVerdict, + HardwareProfile, + PrecisionRange, + classify_difference, + format_classification, + format_profile, + format_profile_list, +) + +# --- PrecisionRange tests --- + + +class TestPrecisionRange: + def test_contains_within(self): + r = PrecisionRange("max_abs_diff", 0.0, 0.01) + assert r.contains(0.005) + + def test_contains_at_boundary(self): + r = PrecisionRange("max_abs_diff", 0.0, 0.01) + assert r.contains(0.0) + assert r.contains(0.01) + + def test_contains_outside(self): + r = PrecisionRange("max_abs_diff", 0.0, 0.01) + assert 
not r.contains(0.02) + + def test_round_trip(self): + r = PrecisionRange("cosine_sim", 0.99, 1.0, "test desc") + d = r.to_dict() + r2 = PrecisionRange.from_dict(d) + assert r2.metric == r.metric + assert r2.expected_min == r.expected_min + assert r2.expected_max == r.expected_max + assert r2.description == r.description + + +# --- HardwareProfile tests --- + + +class TestHardwareProfile: + def test_round_trip(self): + p = HardwareProfile( + name="test-profile", + gpu_arch="A100", + precision_mode="BF16", + attention_impl="FlashAttention-v2", + tp_degree=2, + ranges=[PrecisionRange("max_abs_diff", 0.0, 0.005)], + metadata={"note": "test"}, + ) + d = p.to_dict() + p2 = HardwareProfile.from_dict(d) + assert p2.name == p.name + assert p2.gpu_arch == p.gpu_arch + assert p2.tp_degree == p.tp_degree + assert len(p2.ranges) == 1 + assert p2.metadata == {"note": "test"} + + def test_get_range_found(self): + p = HardwareProfile( + name="t", gpu_arch="A100", precision_mode="BF16", + attention_impl="FA", tp_degree=1, + ranges=[PrecisionRange("max_abs_diff", 0.0, 0.01)], + ) + r = p.get_range("max_abs_diff") + assert r is not None + assert r.expected_max == 0.01 + + def test_get_range_not_found(self): + p = HardwareProfile( + name="t", gpu_arch="A100", precision_mode="BF16", + attention_impl="FA", tp_degree=1, + ) + assert p.get_range("nonexistent") is None + + +# --- classify_difference tests --- + + +class TestClassifyDifference: + def _make_profile(self): + return HardwareProfile( + name="test", + gpu_arch="A100", + precision_mode="BF16", + attention_impl="FA", + tp_degree=1, + ranges=[ + PrecisionRange("max_abs_diff", 0.0, 0.002), + PrecisionRange("mean_abs_diff", 0.0, 0.0005), + PrecisionRange("cosine_sim", 0.9995, 1.0), + ], + ) + + def test_all_expected(self): + profile = self._make_profile() + report = classify_difference(profile, { + "max_abs_diff": 0.001, + "mean_abs_diff": 0.0003, + "cosine_sim": 0.9998, + }) + assert report.overall_classification == "expected" + 
assert all(v.classification == "expected" for v in report.verdicts) + + def test_suspicious_diff(self): + profile = self._make_profile() + report = classify_difference(profile, { + "max_abs_diff": 0.003, # slightly above 0.002 max, ratio < 2 + }) + assert report.verdicts[0].classification == "suspicious" + + def test_likely_bug_diff(self): + profile = self._make_profile() + report = classify_difference(profile, { + "max_abs_diff": 0.01, # 5x the max -> likely_bug + }) + assert report.verdicts[0].classification == "likely_bug" + + def test_cosine_sim_below_min(self): + profile = self._make_profile() + report = classify_difference(profile, { + "cosine_sim": 0.98, # far below 0.9995 + }) + assert report.verdicts[0].classification == "likely_bug" + + def test_cosine_sim_suspicious(self): + profile = self._make_profile() + report = classify_difference(profile, { + "cosine_sim": 0.998, # below 0.9995 but gap < 0.01 + }) + assert report.verdicts[0].classification == "suspicious" + + def test_unknown_metric(self): + profile = self._make_profile() + report = classify_difference(profile, {"unknown_metric": 42.0}) + assert report.verdicts[0].classification == "unknown" + + def test_empty_observations(self): + profile = self._make_profile() + report = classify_difference(profile, {}) + assert report.overall_classification == "unknown" + assert len(report.verdicts) == 0 + + def test_overall_worst(self): + profile = self._make_profile() + report = classify_difference(profile, { + "max_abs_diff": 0.001, # expected + "cosine_sim": 0.98, # likely_bug + }) + assert report.overall_classification == "likely_bug" + + def test_below_min_diff_is_expected(self): + """Values below min for diff metrics (less error) should be fine.""" + profile = HardwareProfile( + name="t", gpu_arch="A100", precision_mode="BF16", + attention_impl="FA", tp_degree=1, + ranges=[PrecisionRange("max_abs_diff", 0.001, 0.01)], + ) + report = classify_difference(profile, {"max_abs_diff": 0.0005}) + # Below min for 
a diff metric → classified as expected (less error is fine) + assert report.verdicts[0].classification == "expected" + + +# --- BaselineDB tests --- + + +class TestBaselineDB: + def test_builtin_profiles_loaded(self): + db = BaselineDB() + names = db.list_profiles() + assert len(names) >= 6 + assert "a100-bf16-tp1" in names + assert "h100-fp8-tp4" in names + + def test_get_profile(self): + db = BaselineDB() + p = db.get_profile("a100-bf16-tp1") + assert p is not None + assert p.gpu_arch == "A100" + + def test_get_profile_not_found(self): + db = BaselineDB() + assert db.get_profile("nonexistent") is None + + def test_add_and_remove(self): + db = BaselineDB() + custom = HardwareProfile( + name="custom-test", gpu_arch="Custom", precision_mode="FP32", + attention_impl="Naive", tp_degree=1, + ) + db.add_profile(custom) + assert "custom-test" in db.list_profiles() + assert db.remove_profile("custom-test") is True + assert "custom-test" not in db.list_profiles() + + def test_remove_nonexistent(self): + db = BaselineDB() + assert db.remove_profile("nonexistent") is False + + def test_export_import_json(self, tmp_path: Path): + db = BaselineDB() + path = tmp_path / "profiles.json" + db.export_json(path) + assert path.exists() + data = json.loads(path.read_text()) + assert data["version"] == 1 + assert len(data["profiles"]) >= 6 + + # Import into fresh DB + db2 = BaselineDB() + db2._profiles.clear() + count = db2.import_json(path) + assert count >= 6 + assert "a100-bf16-tp1" in db2.list_profiles() + + def test_find_profiles_by_gpu(self): + db = BaselineDB() + results = db.find_profiles(gpu_arch="A100") + assert len(results) >= 2 + assert all(p.gpu_arch == "A100" for p in results) + + def test_find_profiles_by_precision(self): + db = BaselineDB() + results = db.find_profiles(precision_mode="FP8") + assert len(results) >= 1 + assert all(p.precision_mode == "FP8" for p in results) + + def test_find_profiles_by_tp(self): + db = BaselineDB() + results = 
db.find_profiles(tp_degree=4) + assert len(results) >= 1 + assert all(p.tp_degree == 4 for p in results) + + def test_find_profiles_no_match(self): + db = BaselineDB() + results = db.find_profiles(gpu_arch="NonexistentGPU") + assert results == [] + + +# --- Format tests --- + + +class TestFormatting: + def test_format_profile(self): + p = HardwareProfile( + name="test", gpu_arch="A100", precision_mode="BF16", + attention_impl="FA-v2", tp_degree=1, + ranges=[PrecisionRange("max_abs_diff", 0.0, 0.01, "test range")], + ) + text = format_profile(p) + assert "test" in text + assert "A100" in text + assert "BF16" in text + assert "max_abs_diff" in text + + def test_format_profile_list(self): + db = BaselineDB() + text = format_profile_list(db) + assert "a100-bf16-tp1" in text + assert "GPU" in text + + def test_format_profile_list_empty(self): + db = BaselineDB() + db._profiles.clear() + text = format_profile_list(db) + assert "No profiles" in text + + def test_format_classification_expected(self): + report = ClassificationReport( + profile_name="test", + verdicts=[DifferenceVerdict( + metric="max_abs_diff", + observed_value=0.001, + expected_range=PrecisionRange("max_abs_diff", 0.0, 0.01), + classification="expected", + reasoning="within range", + )], + ) + text = format_classification(report) + assert "✅" in text + assert "expected" in text + + def test_format_classification_bug(self): + report = ClassificationReport( + profile_name="test", + verdicts=[DifferenceVerdict( + metric="max_abs_diff", + observed_value=0.1, + expected_range=PrecisionRange("max_abs_diff", 0.0, 0.01), + classification="likely_bug", + reasoning="outside range", + )], + ) + text = format_classification(report) + assert "❌" in text + assert "likely_bug" in text + + +# --- Serialization round-trip --- + + +class TestSerialization: + def test_classification_report_to_dict(self): + report = ClassificationReport( + profile_name="test", + verdicts=[DifferenceVerdict( + metric="m", observed_value=0.5, + 
expected_range=PrecisionRange("m", 0.0, 1.0), + classification="expected", reasoning="ok", + )], + ) + d = report.to_dict() + assert d["profile_name"] == "test" + assert d["overall_classification"] == "expected" + assert len(d["verdicts"]) == 1 + + def test_verdict_to_dict_no_range(self): + v = DifferenceVerdict( + metric="x", observed_value=1.0, + expected_range=None, + classification="unknown", reasoning="no range", + ) + d = v.to_dict() + assert d["expected_range"] is None