diff --git a/ROADMAP.md b/ROADMAP.md
index 956e916..8834166 100644
--- a/ROADMAP.md
+++ b/ROADMAP.md
@@ -867,7 +867,7 @@ get from a Python script calling `/v1/chat/completions`.
 - Detects: one bad GPU in a pool, one node with wrong precision config, asymmetric NCCL issues
 - Critical for production clusters where "5% divergence rate" might be "one node is broken, the rest are fine"
 
-## M90: Hardware Precision Baseline Library
+## M90: Hardware Precision Baseline Library ✅
 - Collect and maintain reference data for expected numerical differences across:
   - GPU architectures: A100 vs H100 vs H200 vs Gaudi2 vs Gaudi3
   - Precision modes: FP16 vs BF16 vs FP8 vs INT8-KV
diff --git a/docs/iterations/current.md b/docs/iterations/current.md
index 627d48d..bf3710e 100644
--- a/docs/iterations/current.md
+++ b/docs/iterations/current.md
@@ -53,4 +53,5 @@ shell for exploratory comparison of two endpoints.
 | M85 | 2026-04-06 | Offline Mode — File-Based Comparison | ✅ merged | Both approved |
 | M87 | 2026-04-06 | Automatic KV Cache Export from vLLM | ✅ merged | Both approved |
 | M88 | 2026-04-06 | Framework-Level Inference Hooks | ✅ merged | Both approved |
-| M89 | 2026-04-06 | PD Topology-Aware Testing | ⏳ pending review | — |
+| M89 | 2026-04-06 | PD Topology-Aware Testing | ✅ merged | Both approved |
+| M90 | 2026-04-06 | Hardware Precision Baseline Library | ⏳ pending review | — |
diff --git a/src/xpyd_acc/cli/__init__.py b/src/xpyd_acc/cli/__init__.py
index 15c6bb8..2057105 100644
--- a/src/xpyd_acc/cli/__init__.py
+++ b/src/xpyd_acc/cli/__init__.py
@@ -19,6 +19,7 @@
     _run_length_bias,
     _run_sensitivity,
     _run_watch,
+    handle_baseline_db,
     handle_capture_kv,
     handle_heatmap,
     handle_root_cause,
@@ -145,6 +146,7 @@ def main(argv: list[str] | None = None) -> None:
         "latency-regression": lambda: _run_latency_regression(args),
         "compare-files": lambda: _run_file_compare(args),
         "topology-scan": lambda: handle_topology_scan(args),
+        "baseline-db": lambda: handle_baseline_db(args),
     }
 
     if args.command in _early:
diff --git a/src/xpyd_acc/cli/analysis.py b/src/xpyd_acc/cli/analysis.py
index 51fa376..a0fc211 100644
--- a/src/xpyd_acc/cli/analysis.py
+++ b/src/xpyd_acc/cli/analysis.py
@@ -515,3 +515,84 @@ def mock_test(p_node: TopologyNode, d_node: TopologyNode) -> NodePairResult:
     if getattr(args, "json", None):
         report.to_json(args.json)
         print(f"Topology report exported to {args.json}")
+
+
+def handle_baseline_db(args: argparse.Namespace) -> None:
+    """Dispatch the ``baseline-db`` subcommand on ``args.baseline_action``.
+
+    Results go to stdout. Failures (unknown profile, unreadable or malformed
+    import/export file, no observations, missing action) are reported on
+    stderr and exit with status 1. Every call starts from a fresh
+    ``BaselineDB``, so ``import`` only validates and counts the profiles in
+    the file; nothing is persisted by this handler.
+
+    Args:
+        args: parsed CLI namespace; the attributes read depend on the action
+            (``profile``, ``output``, ``input``, ``gpu``, ``precision``,
+            ``tp``, ``max_abs_diff``, ``mean_abs_diff``, ``cosine_sim``,
+            ``json``).
+
+    Raises:
+        SystemExit: with code 1 on any of the failures listed above.
+    """
+    import json as _json
+    from pathlib import Path
+
+    from xpyd_acc.hw_baseline import (
+        BaselineDB,
+        classify_difference,
+        format_classification,
+        format_profile,
+        format_profile_list,
+    )
+
+    def _fail(message: str) -> None:
+        # Single error exit so every failure path behaves the same way.
+        print(message, file=sys.stderr)
+        raise SystemExit(1)
+
+    db = BaselineDB()
+    action = getattr(args, "baseline_action", None)
+
+    if action == "list":
+        print(format_profile_list(db))
+
+    elif action == "show":
+        profile = db.get_profile(args.profile)
+        if profile is None:
+            _fail(f"Profile '{args.profile}' not found.")
+        print(format_profile(profile))
+
+    elif action == "export":
+        try:
+            db.export_json(args.output)
+        except OSError as exc:
+            _fail(f"Cannot write '{args.output}': {exc}")
+        print(f"Exported {len(db.list_profiles())} profiles to {args.output}")
+
+    elif action == "import":
+        try:
+            count = db.import_json(args.input)
+        except (OSError, ValueError, KeyError, TypeError, AttributeError) as exc:
+            # ValueError covers json.JSONDecodeError and decoding errors; the
+            # rest cover files that parse as JSON but lack the profile schema.
+            _fail(f"Cannot import '{args.input}': {exc!r}")
+        print(f"Imported {count} profiles.")
+
+    elif action == "find":
+        results = db.find_profiles(
+            gpu_arch=args.gpu,
+            precision_mode=args.precision,
+            tp_degree=args.tp,
+        )
+        if not results:
+            print("No matching profiles found.")
+        for p in results:
+            print(format_profile(p))
+            print()
+
+    elif action == "classify":
+        profile = db.get_profile(args.profile)
+        if profile is None:
+            _fail(f"Profile '{args.profile}' not found.")
+        # Only the metrics the user actually supplied are classified.
+        observations: dict[str, float] = {
+            metric: getattr(args, metric)
+            for metric in ("max_abs_diff", "mean_abs_diff", "cosine_sim")
+            if getattr(args, metric) is not None
+        }
+        if not observations:
+            _fail(
+                "No observations provided. "
+                "Use --max-abs-diff, --mean-abs-diff, or --cosine-sim."
+            )
+        report = classify_difference(profile, observations)
+        print(format_classification(report))
+        if getattr(args, "json", None):
+            Path(args.json).write_text(_json.dumps(report.to_dict(), indent=2) + "\n")
+            print(f"Classification exported to {args.json}")
+
+    else:
+        _fail(
+            "Usage: xpyd-acc baseline-db "
+            "{list|show|export|import|find|classify}"
+        )
diff --git a/src/xpyd_acc/cli/parsers.py b/src/xpyd_acc/cli/parsers.py
index f52b112..dc4414e 100644
--- a/src/xpyd_acc/cli/parsers.py
+++ b/src/xpyd_acc/cli/parsers.py
@@ -56,6 +56,7 @@ def register_all(sub: argparse._SubParsersAction) -> None:
     _register_file_compare(sub)
     _register_trace(sub)
     _register_topology_scan(sub)
+    _register_baseline_db(sub)
 def _register_compare(sub):
     lp = sub.add_parser("compare-logprobs", help="Compare logprobs between two endpoints")
     lp.add_argument("--baseline", required=True, help="Baseline endpoint URL")
@@ -759,3 +760,42 @@ def _register_topology_scan(sub):
         "--mock", action="store_true", default=False,
         help="Use mock topology for testing (no real endpoints)",
     )
+
+
+def _register_baseline_db(sub):
+    """Register ``baseline-db`` and its six actions on the given subparsers.
+
+    The selected action (list, show, export, import, find, classify) is
+    stored on the parsed namespace as ``baseline_action``; it stays ``None``
+    when no action is given.
+    """
+    parser = sub.add_parser(
+        "baseline-db",
+        help="Manage hardware precision baseline profiles",
+    )
+    actions = parser.add_subparsers(dest="baseline_action")
+
+    actions.add_parser("list", help="List available hardware profiles")
+
+    actions.add_parser(
+        "show", help="Show details of a specific profile",
+    ).add_argument("profile", help="Profile name")
+
+    actions.add_parser(
+        "export", help="Export all profiles to JSON",
+    ).add_argument("--output", required=True, help="Output JSON file path")
+
+    actions.add_parser(
+        "import", help="Import profiles from JSON",
+    ).add_argument("--input", required=True, help="Input JSON file path")
+
+    finder = actions.add_parser("find", help="Find profiles by criteria")
+    finder.add_argument("--gpu", default=None, help="Filter by GPU architecture")
+    finder.add_argument("--precision", default=None, help="Filter by precision mode")
+    finder.add_argument("--tp", type=int, default=None, help="Filter by TP degree")
+
+    classifier = actions.add_parser("classify", help="Classify observed differences")
+    classifier.add_argument("--profile", required=True, help="Profile name to compare against")
+    # The three observation flags differ only in name and help text.
+    for flag, summary in (
+        ("--max-abs-diff", "Observed max absolute diff"),
+        ("--mean-abs-diff", "Observed mean absolute diff"),
+        ("--cosine-sim", "Observed cosine similarity"),
+    ):
+        classifier.add_argument(
+            flag, type=float, default=None,
+            dest=flag.lstrip("-").replace("-", "_"), help=summary,
+        )
+    classifier.add_argument("--json", default=None, help="Export classification as JSON")
diff --git a/src/xpyd_acc/hw_baseline.py b/src/xpyd_acc/hw_baseline.py
new file mode 100644
index 0000000..1919d6b
--- /dev/null
+++ b/src/xpyd_acc/hw_baseline.py
@@ -0,0 +1,437 @@
+"""Hardware Precision Baseline Library.
+
+Maintains reference precision profiles for common GPU hardware configurations
+and classifies observed numerical differences as expected hardware variance
+or likely software bugs.
+"""
+
+from __future__ import annotations
+
+import json
+from dataclasses import dataclass, field
+from pathlib import Path
+
+
+@dataclass
+class PrecisionRange:
+    """Inclusive ``[expected_min, expected_max]`` band for one named metric."""
+
+    metric: str  # "max_abs_diff", "mean_abs_diff", "cosine_sim"
+    expected_min: float
+    expected_max: float
+    description: str = ""
+
+    def contains(self, value: float) -> bool:
+        """Return True when ``value`` lies inside the band, bounds included."""
+        at_or_above_min = self.expected_min <= value
+        at_or_below_max = value <= self.expected_max
+        return at_or_above_min and at_or_below_max
+
+    def to_dict(self) -> dict:
+        """Serialize to a plain dict with one key per field."""
+        keys = ("metric", "expected_min", "expected_max", "description")
+        return {key: getattr(self, key) for key in keys}
+
+    @classmethod
+    def from_dict(cls, data: dict) -> PrecisionRange:
+        """Build a range from a dict; ``description`` defaults to ``""``."""
+        return cls(
+            data["metric"],
+            data["expected_min"],
+            data["expected_max"],
+            data.get("description", ""),
+        )
+
+
+@dataclass
+class HardwareProfile:
+    """Named hardware/runtime configuration and the metric ranges expected on it."""
+
+    name: str
+    gpu_arch: str  # e.g. "A100", "H100", "H200", "Gaudi2", "Gaudi3"
+    precision_mode: str  # e.g. "FP16", "BF16", "FP8", "INT8-KV"
+    attention_impl: str  # e.g. "FlashAttention-v2", "PagedAttention", "xFormers"
+    tp_degree: int  # tensor parallelism degree
+    ranges: list[PrecisionRange] = field(default_factory=list)
+    metadata: dict = field(default_factory=dict)
+
+    def to_dict(self) -> dict:
+        """Serialize to a plain dict; each range is serialized via its ``to_dict``."""
+        scalar_keys = ("name", "gpu_arch", "precision_mode", "attention_impl", "tp_degree")
+        payload = {key: getattr(self, key) for key in scalar_keys}
+        payload["ranges"] = [rng.to_dict() for rng in self.ranges]
+        payload["metadata"] = self.metadata
+        return payload
+
+    @classmethod
+    def from_dict(cls, data: dict) -> HardwareProfile:
+        """Build a profile from a dict; ``ranges`` and ``metadata`` are optional."""
+        scalar_keys = ("name", "gpu_arch", "precision_mode", "attention_impl", "tp_degree")
+        required = {key: data[key] for key in scalar_keys}
+        parsed_ranges = [PrecisionRange.from_dict(item) for item in data.get("ranges", [])]
+        return cls(**required, ranges=parsed_ranges, metadata=data.get("metadata", {}))
+
+    def get_range(self, metric: str) -> PrecisionRange | None:
+        """Return the first range whose metric equals ``metric``, else None."""
+        return next((rng for rng in self.ranges if rng.metric == metric), None)
+
+
+@dataclass
+class DifferenceVerdict:
+    """Outcome of comparing one observed metric value with a profile range."""
+
+    metric: str
+    observed_value: float
+    expected_range: PrecisionRange | None
+    classification: str  # "expected", "suspicious", "likely_bug", "unknown"
+    reasoning: str
+
+    def to_dict(self) -> dict:
+        """Serialize to a plain dict; a missing range is emitted as ``None``."""
+        if self.expected_range:
+            range_payload = self.expected_range.to_dict()
+        else:
+            range_payload = None
+        return {
+            "metric": self.metric,
+            "observed_value": self.observed_value,
+            "expected_range": range_payload,
+            "classification": self.classification,
+            "reasoning": self.reasoning,
+        }
+
+
+@dataclass
+class ClassificationReport:
+    """Per-metric verdicts for one profile, plus a worst-case summary."""
+
+    profile_name: str
+    verdicts: list[DifferenceVerdict]
+
+    @property
+    def overall_classification(self) -> str:
+        """Return the most severe classification among the verdicts.
+
+        Severity order is likely_bug > suspicious > unknown > expected;
+        unrecognized labels rank with "expected". On ties the earliest
+        verdict wins, and a report without verdicts is "unknown".
+        """
+        rank = {"likely_bug": 3, "suspicious": 2, "unknown": 1, "expected": 0}
+        if not self.verdicts:
+            return "unknown"
+        leader = self.verdicts[0]
+        for candidate in self.verdicts[1:]:
+            # Strict ">" keeps the first verdict when severities are equal.
+            if rank.get(candidate.classification, 0) > rank.get(leader.classification, 0):
+                leader = candidate
+        return leader.classification
+
+    def to_dict(self) -> dict:
+        """Serialize the report, including the computed overall classification."""
+        overall = self.overall_classification
+        serialized = [verdict.to_dict() for verdict in self.verdicts]
+        return {
+            "profile_name": self.profile_name,
+            "overall_classification": overall,
+            "verdicts": serialized,
+        }
+
+
+def classify_difference(
+    profile: HardwareProfile,
+    observations: dict[str, float],
+) -> ClassificationReport:
+    """Classify observed numerical differences against a hardware profile.
+
+    Each observation is judged independently:
+
+    - no range for the metric in the profile -> "unknown"
+    - NaN -> "likely_bug" (NaN fails every comparison, so it has to be caught
+      explicitly or it would slip through as "expected")
+    - inside the inclusive range -> "expected"
+    - "cosine_sim" below the minimum -> "likely_bug" when short by more than
+      0.01, otherwise "suspicious"; above the maximum -> "expected"
+    - any other metric above the maximum -> "likely_bug" when more than 2x
+      the maximum (or when the maximum is not positive), otherwise
+      "suspicious"; below the minimum -> "expected" (less error than
+      anticipated)
+
+    Args:
+        profile: hardware profile with expected ranges
+        observations: dict mapping metric names to observed values
+            e.g. {"max_abs_diff": 0.001, "mean_abs_diff": 0.0001, "cosine_sim": 0.9999}
+
+    Returns:
+        ClassificationReport with one verdict per observation, in input order
+    """
+    import math
+
+    verdicts: list[DifferenceVerdict] = []
+
+    for metric, value in observations.items():
+        expected = profile.get_range(metric)
+
+        if expected is None:
+            verdicts.append(DifferenceVerdict(
+                metric=metric,
+                observed_value=value,
+                expected_range=None,
+                classification="unknown",
+                reasoning=f"No expected range for '{metric}' in profile '{profile.name}'",
+            ))
+            continue
+
+        bounds = f"[{expected.expected_min}, {expected.expected_max}]"
+        outside = f"{metric}={value} is outside expected range {bounds} for {profile.name}"
+
+        if math.isnan(value):
+            classification = "likely_bug"
+            reasoning = (
+                f"{metric}={value} is not a number and cannot be compared "
+                f"with expected range {bounds} for {profile.name}"
+            )
+        elif expected.contains(value):
+            classification = "expected"
+            reasoning = f"{metric}={value} is within expected range {bounds} for {profile.name}"
+        elif metric == "cosine_sim":
+            # Only a shortfall is worrying; similarity above the range is fine.
+            if value < expected.expected_min:
+                gap = expected.expected_min - value
+                classification = "likely_bug" if gap > 0.01 else "suspicious"
+                reasoning = outside
+            else:
+                classification = "expected"
+                reasoning = (
+                    f"{metric}={value} is above expected range {bounds} "
+                    f"for {profile.name}; higher similarity is not a concern"
+                )
+        else:
+            # Diff metrics: only an excess is worrying; less error is fine.
+            if value > expected.expected_max:
+                ceiling = expected.expected_max
+                # A non-positive ceiling makes any excess infinitely large.
+                ratio = value / ceiling if ceiling > 0 else float("inf")
+                classification = "likely_bug" if ratio > 2.0 else "suspicious"
+                reasoning = outside
+            else:
+                classification = "expected"
+                reasoning = (
+                    f"{metric}={value} is below expected range {bounds} "
+                    f"for {profile.name}; less error is not a concern"
+                )
+
+        verdicts.append(DifferenceVerdict(
+            metric=metric,
+            observed_value=value,
+            expected_range=expected,
+            classification=classification,
+            reasoning=reasoning,
+        ))
+
+    return ClassificationReport(profile_name=profile.name, verdicts=verdicts)
+
+
+# --- Built-in Profiles ---
+
+# Profiles that BaselineDB.__init__ loads into every new database, keyed by
+# their ``name``. Names are lowercase and roughly follow
+# "<gpu>-<precision>-tp<degree>". Each profile defines the same three metrics:
+# "max_abs_diff" and "mean_abs_diff" ranges start at 0.0, and "cosine_sim"
+# ranges end at 1.0.
+# NOTE(review): the source of these numeric bounds is not recorded in this
+# module -- confirm how they were measured before relying on them.
+_BUILTIN_PROFILES: list[HardwareProfile] = [
+    HardwareProfile(
+        name="a100-bf16-tp1",
+        gpu_arch="A100",
+        precision_mode="BF16",
+        attention_impl="FlashAttention-v2",
+        tp_degree=1,
+        ranges=[
+            PrecisionRange("max_abs_diff", 0.0, 0.002, "BF16 rounding on A100"),
+            PrecisionRange("mean_abs_diff", 0.0, 0.0005, "Average BF16 error"),
+            PrecisionRange("cosine_sim", 0.9995, 1.0, "Expected cosine similarity"),
+        ],
+    ),
+    HardwareProfile(
+        name="a100-fp16-tp1",
+        gpu_arch="A100",
+        precision_mode="FP16",
+        attention_impl="FlashAttention-v2",
+        tp_degree=1,
+        ranges=[
+            PrecisionRange("max_abs_diff", 0.0, 0.001, "FP16 rounding on A100"),
+            PrecisionRange("mean_abs_diff", 0.0, 0.0002, "Average FP16 error"),
+            PrecisionRange("cosine_sim", 0.9998, 1.0, "Expected cosine similarity"),
+        ],
+    ),
+    HardwareProfile(
+        name="h100-bf16-tp4",
+        gpu_arch="H100",
+        precision_mode="BF16",
+        attention_impl="FlashAttention-v3",
+        tp_degree=4,
+        ranges=[
+            PrecisionRange("max_abs_diff", 0.0, 0.005, "BF16 + TP=4 accumulation on H100"),
+            PrecisionRange("mean_abs_diff", 0.0, 0.001, "Average BF16 TP=4 error"),
+            PrecisionRange("cosine_sim", 0.999, 1.0, "Expected cosine similarity"),
+        ],
+    ),
+    HardwareProfile(
+        name="h100-fp8-tp4",
+        gpu_arch="H100",
+        precision_mode="FP8",
+        attention_impl="FlashAttention-v3",
+        tp_degree=4,
+        ranges=[
+            PrecisionRange("max_abs_diff", 0.0, 0.02, "FP8 quantization noise on H100"),
+            PrecisionRange("mean_abs_diff", 0.0, 0.005, "Average FP8 error"),
+            PrecisionRange("cosine_sim", 0.995, 1.0, "Expected cosine similarity with FP8"),
+        ],
+    ),
+    HardwareProfile(
+        name="h200-bf16-tp8",
+        gpu_arch="H200",
+        precision_mode="BF16",
+        attention_impl="FlashAttention-v3",
+        tp_degree=8,
+        ranges=[
+            PrecisionRange("max_abs_diff", 0.0, 0.008, "BF16 + TP=8 on H200"),
+            PrecisionRange("mean_abs_diff", 0.0, 0.002, "Average BF16 TP=8 error"),
+            PrecisionRange("cosine_sim", 0.998, 1.0, "Expected cosine similarity"),
+        ],
+    ),
+    HardwareProfile(
+        name="gaudi2-bf16-tp1",
+        gpu_arch="Gaudi2",
+        precision_mode="BF16",
+        attention_impl="PagedAttention",
+        tp_degree=1,
+        ranges=[
+            PrecisionRange("max_abs_diff", 0.0, 0.003, "BF16 on Gaudi2"),
+            PrecisionRange("mean_abs_diff", 0.0, 0.0008, "Average BF16 error on Gaudi2"),
+            PrecisionRange("cosine_sim", 0.9993, 1.0, "Expected cosine similarity"),
+        ],
+    ),
+    HardwareProfile(
+        name="gaudi3-fp8-tp4",
+        gpu_arch="Gaudi3",
+        precision_mode="FP8",
+        attention_impl="PagedAttention",
+        tp_degree=4,
+        ranges=[
+            PrecisionRange("max_abs_diff", 0.0, 0.025, "FP8 on Gaudi3 TP=4"),
+            PrecisionRange("mean_abs_diff", 0.0, 0.006, "Average FP8 error on Gaudi3"),
+            PrecisionRange("cosine_sim", 0.994, 1.0, "Expected cosine similarity"),
+        ],
+    ),
+    HardwareProfile(
+        name="a100-int8kv-tp2",
+        gpu_arch="A100",
+        precision_mode="INT8-KV",
+        attention_impl="PagedAttention",
+        tp_degree=2,
+        ranges=[
+            PrecisionRange("max_abs_diff", 0.0, 0.015, "INT8 KV cache quantization on A100"),
+            PrecisionRange("mean_abs_diff", 0.0, 0.004, "Average INT8-KV error"),
+            PrecisionRange("cosine_sim", 0.996, 1.0, "Expected cosine similarity with INT8-KV"),
+        ],
+    ),
+]
+
+
+class BaselineDB:
+    """In-memory collection of hardware precision profiles keyed by name.
+
+    A new instance is pre-populated with private copies of the built-in
+    profiles, so editing a profile obtained from one DB never alters the
+    module-level defaults or another DB instance.
+    """
+
+    def __init__(self) -> None:
+        import copy
+
+        # Deep-copy: profiles are mutable dataclasses, and sharing the
+        # module-level objects would leak edits across DB instances.
+        self._profiles: dict[str, HardwareProfile] = {
+            p.name: copy.deepcopy(p) for p in _BUILTIN_PROFILES
+        }
+
+    def list_profiles(self) -> list[str]:
+        """Return sorted list of available profile names."""
+        return sorted(self._profiles)
+
+    def get_profile(self, name: str) -> HardwareProfile | None:
+        """Look up a profile by name; returns None when it does not exist."""
+        return self._profiles.get(name)
+
+    def add_profile(self, profile: HardwareProfile) -> None:
+        """Add a profile, replacing any existing one with the same name."""
+        self._profiles[profile.name] = profile
+
+    def remove_profile(self, name: str) -> bool:
+        """Remove a profile. Returns True if it existed."""
+        if name in self._profiles:
+            del self._profiles[name]
+            return True
+        return False
+
+    def export_json(self, path: str | Path) -> None:
+        """Export all profiles to a UTF-8 JSON file (``version`` 1 layout)."""
+        data = {
+            "version": 1,
+            "profiles": [p.to_dict() for p in self._profiles.values()],
+        }
+        # Explicit encoding keeps the file portable across platform locales.
+        Path(path).write_text(json.dumps(data, indent=2) + "\n", encoding="utf-8")
+
+    def import_json(self, path: str | Path) -> int:
+        """Import profiles from a JSON file. Returns count of imported profiles.
+
+        Entries with an existing name replace the stored profile. The import
+        is all-or-nothing: if the file cannot be read or any entry is
+        malformed, the exception propagates and the DB is left unchanged.
+        """
+        data = json.loads(Path(path).read_text(encoding="utf-8"))
+        # Parse every entry before touching the DB so a bad entry late in
+        # the file cannot leave a half-imported state behind.
+        parsed = [HardwareProfile.from_dict(entry) for entry in data.get("profiles", [])]
+        for profile in parsed:
+            self._profiles[profile.name] = profile
+        return len(parsed)
+
+    def find_profiles(
+        self,
+        gpu_arch: str | None = None,
+        precision_mode: str | None = None,
+        tp_degree: int | None = None,
+    ) -> list[HardwareProfile]:
+        """Find profiles matching every given criterion, sorted by name.
+
+        String filters compare case-insensitively; a ``None`` (or empty)
+        filter matches everything.
+        """
+        # Lower-case the filters once instead of once per profile.
+        wanted_gpu = gpu_arch.lower() if gpu_arch else None
+        wanted_precision = precision_mode.lower() if precision_mode else None
+        matches = [
+            p for p in self._profiles.values()
+            if (wanted_gpu is None or p.gpu_arch.lower() == wanted_gpu)
+            and (wanted_precision is None or p.precision_mode.lower() == wanted_precision)
+            and (tp_degree is None or p.tp_degree == tp_degree)
+        ]
+        return sorted(matches, key=lambda p: p.name)
+
+
+def format_profile(profile: HardwareProfile) -> str:
+    """Render one profile as an indented multi-line block for the terminal.
+
+    The ranges and metadata sections appear only when they are non-empty.
+    """
+    out = [
+        f"Profile: {profile.name}",
+        f"  GPU: {profile.gpu_arch}",
+        f"  Precision: {profile.precision_mode}",
+        f"  Attention: {profile.attention_impl}",
+        f"  TP Degree: {profile.tp_degree}",
+    ]
+
+    if profile.ranges:
+        out.append("  Expected Ranges:")
+        for rng in profile.ranges:
+            # The description is appended in parentheses only when present.
+            suffix = f" ({rng.description})" if rng.description else ""
+            out.append(f"    {rng.metric}: [{rng.expected_min}, {rng.expected_max}]{suffix}")
+
+    if profile.metadata:
+        out.append(f"  Metadata: {profile.metadata}")
+
+    return "\n".join(out)
+
+
+def format_profile_list(db: BaselineDB) -> str:
+    """Render every profile in ``db`` as a fixed-width table, one row each.
+
+    Returns a short notice instead of a table when the DB has no profiles.
+    """
+    names = db.list_profiles()
+    if not names:
+        return "No profiles available."
+
+    header = f"{'Name':<25} {'GPU':<10} {'Precision':<10} {'Attention':<20} {'TP':>3}"
+    rows = [header, "-" * len(header)]
+
+    for profile in map(db.get_profile, names):
+        # Skip names whose lookup comes back empty.
+        if not profile:
+            continue
+        rows.append(
+            f"{profile.name:<25} {profile.gpu_arch:<10} {profile.precision_mode:<10} "
+            f"{profile.attention_impl:<20} {profile.tp_degree:>3}"
+        )
+
+    return "\n".join(rows)
+
+
+def format_classification(report: ClassificationReport) -> str:
+    """Render a classification report as text: summary first, then verdicts.
+
+    Every classification gets an icon; labels without a dedicated icon fall
+    back to the question-mark icon.
+    """
+    icons = {
+        "expected": "✅",
+        "suspicious": "⚠️",
+        "likely_bug": "❌",
+        "unknown": "❓",
+    }
+    overall = report.overall_classification
+    out = [
+        f"Hardware Profile: {report.profile_name}",
+        f"Overall: {icons.get(overall, '❓')} {overall}",
+        "",
+    ]
+
+    for verdict in report.verdicts:
+        marker = icons.get(verdict.classification, "❓")
+        out += [
+            f"  {marker} {verdict.metric} = {verdict.observed_value}",
+            f"     {verdict.reasoning}",
+        ]
+
+    return "\n".join(out)
diff --git a/tests/test_hw_baseline.py b/tests/test_hw_baseline.py
new file mode 100644
index 0000000..4e4debb
--- /dev/null
+++ b/tests/test_hw_baseline.py
@@ -0,0 +1,334 @@
+"""Tests for hw_baseline module — Hardware Precision Baseline Library."""
+
+from __future__ import annotations
+
+import json
+from pathlib import Path
+
+from xpyd_acc.hw_baseline import (
+    BaselineDB,
+    ClassificationReport,
+    DifferenceVerdict,
+    HardwareProfile,
+    PrecisionRange,
+    classify_difference,
+    format_classification,
+    format_profile,
+    format_profile_list,
+)
+
+# --- PrecisionRange tests ---
+
+
+class TestPrecisionRange:
+    """Bounds checking and dict round-trip for PrecisionRange."""
+
+    def test_contains_within(self):
+        band = PrecisionRange("max_abs_diff", 0.0, 0.01)
+        assert band.contains(0.005)
+
+    def test_contains_at_boundary(self):
+        # Both ends of the range are inclusive.
+        band = PrecisionRange("max_abs_diff", 0.0, 0.01)
+        for edge in (0.0, 0.01):
+            assert band.contains(edge)
+
+    def test_contains_outside(self):
+        band = PrecisionRange("max_abs_diff", 0.0, 0.01)
+        assert not band.contains(0.02)
+
+    def test_round_trip(self):
+        original = PrecisionRange("cosine_sim", 0.99, 1.0, "test desc")
+        restored = PrecisionRange.from_dict(original.to_dict())
+        for attr in ("metric", "expected_min", "expected_max", "description"):
+            assert getattr(restored, attr) == getattr(original, attr)
+
+
+# --- HardwareProfile tests ---
+
+
+class TestHardwareProfile:
+    """Serialization round-trip and range lookup for HardwareProfile."""
+
+    def test_round_trip(self):
+        original = HardwareProfile(
+            name="test-profile",
+            gpu_arch="A100",
+            precision_mode="BF16",
+            attention_impl="FlashAttention-v2",
+            tp_degree=2,
+            ranges=[PrecisionRange("max_abs_diff", 0.0, 0.005)],
+            metadata={"note": "test"},
+        )
+        restored = HardwareProfile.from_dict(original.to_dict())
+        for attr in ("name", "gpu_arch", "tp_degree"):
+            assert getattr(restored, attr) == getattr(original, attr)
+        assert len(restored.ranges) == 1
+        assert restored.metadata == {"note": "test"}
+
+    def test_get_range_found(self):
+        profile = HardwareProfile(
+            name="t", gpu_arch="A100", precision_mode="BF16",
+            attention_impl="FA", tp_degree=1,
+            ranges=[PrecisionRange("max_abs_diff", 0.0, 0.01)],
+        )
+        found = profile.get_range("max_abs_diff")
+        assert found is not None
+        assert found.expected_max == 0.01
+
+    def test_get_range_not_found(self):
+        # A profile without ranges has nothing to return for any metric.
+        profile = HardwareProfile(
+            name="t", gpu_arch="A100", precision_mode="BF16",
+            attention_impl="FA", tp_degree=1,
+        )
+        assert profile.get_range("nonexistent") is None
+
+
+# --- classify_difference tests ---
+
+
+class TestClassifyDifference:
+    """Severity rules applied by classify_difference."""
+
+    def _make_profile(self):
+        """Profile with the three standard metric ranges used by most cases."""
+        standard_ranges = [
+            PrecisionRange("max_abs_diff", 0.0, 0.002),
+            PrecisionRange("mean_abs_diff", 0.0, 0.0005),
+            PrecisionRange("cosine_sim", 0.9995, 1.0),
+        ]
+        return HardwareProfile(
+            name="test",
+            gpu_arch="A100",
+            precision_mode="BF16",
+            attention_impl="FA",
+            tp_degree=1,
+            ranges=standard_ranges,
+        )
+
+    def test_all_expected(self):
+        observed = {
+            "max_abs_diff": 0.001,
+            "mean_abs_diff": 0.0003,
+            "cosine_sim": 0.9998,
+        }
+        report = classify_difference(self._make_profile(), observed)
+        assert report.overall_classification == "expected"
+        assert all(verdict.classification == "expected" for verdict in report.verdicts)
+
+    def test_suspicious_diff(self):
+        # Slightly above the 0.002 max, ratio < 2.
+        report = classify_difference(self._make_profile(), {"max_abs_diff": 0.003})
+        assert report.verdicts[0].classification == "suspicious"
+
+    def test_likely_bug_diff(self):
+        # 5x the max -> likely_bug.
+        report = classify_difference(self._make_profile(), {"max_abs_diff": 0.01})
+        assert report.verdicts[0].classification == "likely_bug"
+
+    def test_cosine_sim_below_min(self):
+        # Far below the 0.9995 minimum.
+        report = classify_difference(self._make_profile(), {"cosine_sim": 0.98})
+        assert report.verdicts[0].classification == "likely_bug"
+
+    def test_cosine_sim_suspicious(self):
+        # Below 0.9995 but the gap is under 0.01.
+        report = classify_difference(self._make_profile(), {"cosine_sim": 0.998})
+        assert report.verdicts[0].classification == "suspicious"
+
+    def test_unknown_metric(self):
+        report = classify_difference(self._make_profile(), {"unknown_metric": 42.0})
+        assert report.verdicts[0].classification == "unknown"
+
+    def test_empty_observations(self):
+        report = classify_difference(self._make_profile(), {})
+        assert report.overall_classification == "unknown"
+        assert len(report.verdicts) == 0
+
+    def test_overall_worst(self):
+        observed = {
+            "max_abs_diff": 0.001,  # expected
+            "cosine_sim": 0.98,  # likely_bug
+        }
+        report = classify_difference(self._make_profile(), observed)
+        assert report.overall_classification == "likely_bug"
+
+    def test_below_min_diff_is_expected(self):
+        """Values below min for diff metrics (less error) should be fine."""
+        nonzero_floor = HardwareProfile(
+            name="t", gpu_arch="A100", precision_mode="BF16",
+            attention_impl="FA", tp_degree=1,
+            ranges=[PrecisionRange("max_abs_diff", 0.001, 0.01)],
+        )
+        report = classify_difference(nonzero_floor, {"max_abs_diff": 0.0005})
+        # Below min for a diff metric -> classified as expected (less error is fine)
+        assert report.verdicts[0].classification == "expected"
+
+
+# --- BaselineDB tests ---
+
+
+class TestBaselineDB:
+    """Built-in loading, add/remove, JSON export/import and filtering."""
+
+    def test_builtin_profiles_loaded(self):
+        available = BaselineDB().list_profiles()
+        assert len(available) >= 6
+        for expected_name in ("a100-bf16-tp1", "h100-fp8-tp4"):
+            assert expected_name in available
+
+    def test_get_profile(self):
+        profile = BaselineDB().get_profile("a100-bf16-tp1")
+        assert profile is not None
+        assert profile.gpu_arch == "A100"
+
+    def test_get_profile_not_found(self):
+        assert BaselineDB().get_profile("nonexistent") is None
+
+    def test_add_and_remove(self):
+        store = BaselineDB()
+        store.add_profile(HardwareProfile(
+            name="custom-test", gpu_arch="Custom", precision_mode="FP32",
+            attention_impl="Naive", tp_degree=1,
+        ))
+        assert "custom-test" in store.list_profiles()
+        assert store.remove_profile("custom-test") is True
+        assert "custom-test" not in store.list_profiles()
+
+    def test_remove_nonexistent(self):
+        assert BaselineDB().remove_profile("nonexistent") is False
+
+    def test_export_import_json(self, tmp_path: Path):
+        target = tmp_path / "profiles.json"
+        BaselineDB().export_json(target)
+        assert target.exists()
+        exported = json.loads(target.read_text())
+        assert exported["version"] == 1
+        assert len(exported["profiles"]) >= 6
+
+        # Import into a DB emptied of its built-ins
+        fresh = BaselineDB()
+        fresh._profiles.clear()
+        imported = fresh.import_json(target)
+        assert imported >= 6
+        assert "a100-bf16-tp1" in fresh.list_profiles()
+
+    def test_find_profiles_by_gpu(self):
+        matches = BaselineDB().find_profiles(gpu_arch="A100")
+        assert len(matches) >= 2
+        assert all(profile.gpu_arch == "A100" for profile in matches)
+
+    def test_find_profiles_by_precision(self):
+        matches = BaselineDB().find_profiles(precision_mode="FP8")
+        assert len(matches) >= 1
+        assert all(profile.precision_mode == "FP8" for profile in matches)
+
+    def test_find_profiles_by_tp(self):
+        matches = BaselineDB().find_profiles(tp_degree=4)
+        assert len(matches) >= 1
+        assert all(profile.tp_degree == 4 for profile in matches)
+
+    def test_find_profiles_no_match(self):
+        assert BaselineDB().find_profiles(gpu_arch="NonexistentGPU") == []
+
+
+# --- Format tests ---
+
+
+class TestFormatting:
+    """Terminal rendering of profiles, profile lists and reports."""
+
+    def test_format_profile(self):
+        profile = HardwareProfile(
+            name="test", gpu_arch="A100", precision_mode="BF16",
+            attention_impl="FA-v2", tp_degree=1,
+            ranges=[PrecisionRange("max_abs_diff", 0.0, 0.01, "test range")],
+        )
+        rendered = format_profile(profile)
+        for fragment in ("test", "A100", "BF16", "max_abs_diff"):
+            assert fragment in rendered
+
+    def test_format_profile_list(self):
+        rendered = format_profile_list(BaselineDB())
+        assert "a100-bf16-tp1" in rendered
+        assert "GPU" in rendered
+
+    def test_format_profile_list_empty(self):
+        emptied = BaselineDB()
+        emptied._profiles.clear()
+        assert "No profiles" in format_profile_list(emptied)
+
+    def test_format_classification_expected(self):
+        verdict = DifferenceVerdict(
+            metric="max_abs_diff",
+            observed_value=0.001,
+            expected_range=PrecisionRange("max_abs_diff", 0.0, 0.01),
+            classification="expected",
+            reasoning="within range",
+        )
+        rendered = format_classification(
+            ClassificationReport(profile_name="test", verdicts=[verdict])
+        )
+        assert "✅" in rendered
+        assert "expected" in rendered
+
+    def test_format_classification_bug(self):
+        verdict = DifferenceVerdict(
+            metric="max_abs_diff",
+            observed_value=0.1,
+            expected_range=PrecisionRange("max_abs_diff", 0.0, 0.01),
+            classification="likely_bug",
+            reasoning="outside range",
+        )
+        rendered = format_classification(
+            ClassificationReport(profile_name="test", verdicts=[verdict])
+        )
+        assert "❌" in rendered
+        assert "likely_bug" in rendered
+
+
+# --- Serialization round-trip ---
+
+
+class TestSerialization:
+    """Dict serialization of reports and verdicts."""
+
+    def test_classification_report_to_dict(self):
+        verdict = DifferenceVerdict(
+            metric="m", observed_value=0.5,
+            expected_range=PrecisionRange("m", 0.0, 1.0),
+            classification="expected", reasoning="ok",
+        )
+        payload = ClassificationReport(profile_name="test", verdicts=[verdict]).to_dict()
+        assert payload["profile_name"] == "test"
+        assert payload["overall_classification"] == "expected"
+        assert len(payload["verdicts"]) == 1
+
+    def test_verdict_to_dict_no_range(self):
+        # A verdict without a range serializes the range as None.
+        verdict = DifferenceVerdict(
+            metric="x", observed_value=1.0,
+            expected_range=None,
+            classification="unknown", reasoning="no range",
+        )
+        assert verdict.to_dict()["expected_range"] is None