diff --git a/ROADMAP.md b/ROADMAP.md
index 1fd88dc..5c27e5b 100644
--- a/ROADMAP.md
+++ b/ROADMAP.md
@@ -879,7 +879,7 @@ get from a Python script calling `/v1/chat/completions`.
 - Transforms raw numbers into actionable verdicts: "expected hardware variance" vs "likely software bug"
 - Community-contributed baselines: users can submit anonymized precision profiles to build the database
 
-## M91: Smart Retry for Divergent Samples
+## M91: Smart Retry for Divergent Samples ✅
 - After initial batch comparison, automatically retry divergent samples with deterministic settings (temperature=0, seed=42)
 - Classifies divergence as: `deterministic` (reproduces under greedy decoding) or `stochastic` (disappears with greedy)
 - `batch-compare --smart-retry` flag triggers automatic rerun of divergent samples
@@ -891,3 +891,20 @@ get from a Python script calling `/v1/chat/completions`.
 - Integrates with existing `--fail-threshold`: only deterministic divergences count toward threshold
 - `smart_retry.py` module: `run_smart_retry()`, `SmartRetryResult`, `format_smart_retry()`
 - Tests covering retry logic, classification, integration, JSON export, CLI
+
+## M92: Test Suite Generation from Divergent Samples
+- `xpyd-acc generate-suite --report <path> --output <path>` extracts divergent samples into a reusable test dataset
+- Output format: JSONL (compatible with `batch-compare --dataset`)
+- Each entry includes: original prompt, expected baseline output, divergence metadata
+- `--classification <value>` filter: only include samples with the given classification (e.g. `likely_bug`, `likely_uncertainty`)
+- `--deterministic-only` flag: only include samples classified as deterministic (from smart-retry)
+- `--min-logprob-gap <float>` filter: only include high-confidence divergences
+- `--max-samples <int>`: cap the number of samples in the generated suite
+- `--include-expected` flag: embed baseline output as `expected` field for exact-match regression testing
+- `SuiteEntry` dataclass: id, prompt, expected (optional), metadata (classification, divergence_index, logprob_gap)
+- `GenerateSuiteConfig` dataclass for filter settings
+- `generate_suite()` function: load report → filter → emit JSONL
+- Round-trip: generated suite works directly with `batch-compare --dataset`
+- Enables CI workflow: batch-compare → generate-suite → commit suite → rerun on every deploy
+- `test_suite_gen.py` module with `generate_suite()`, `write_suite()`, `format_suite_summary()`, `SuiteEntry`, `GenerateSuiteConfig`
+- 18 tests covering generation, filtering, round-trip compatibility, edge cases, CLI integration
diff --git a/src/xpyd_acc/cli/__init__.py b/src/xpyd_acc/cli/__init__.py
index a0203f6..681e6b2 100644
--- a/src/xpyd_acc/cli/__init__.py
+++ b/src/xpyd_acc/cli/__init__.py
@@ -15,6 +15,7 @@
     _run_entropy,
     _run_file_compare,
     _run_fingerprint,
+    _run_generate_suite,
     _run_latency_regression,
     _run_length_bias,
     _run_sensitivity,
@@ -149,6 +150,7 @@ def main(argv: list[str] | None = None) -> None:
         "topology-scan": lambda: handle_topology_scan(args),
         "baseline-db": lambda: handle_baseline_db(args),
         "smart-retry": lambda: _run_smart_retry(args),
+        "generate-suite": lambda: _run_generate_suite(args),
     }
 
     if args.command in _early:
diff --git a/src/xpyd_acc/cli/analysis.py b/src/xpyd_acc/cli/analysis.py
index baede3c..255faa0 100644
--- a/src/xpyd_acc/cli/analysis.py
+++ b/src/xpyd_acc/cli/analysis.py
@@ -629,3 +629,28 @@ def _run_smart_retry(args: argparse.Namespace) -> None:
 
     if result.deterministic_count > 0:
         raise SystemExit(1)
+
+
+def _run_generate_suite(args: argparse.Namespace) -> None:
+    """Run ``generate-suite``: load the report, filter it, write JSONL, print a summary."""
+    from pathlib import Path
+
+    from xpyd_acc.batch_compare import load_report
+    from xpyd_acc.test_suite_gen import (
+        GenerateSuiteConfig,
+        format_suite_summary,
+        generate_suite,
+        write_suite,
+    )
+
+    report = load_report(args.report)
+    # Map the parsed CLI options onto the generator's filter settings.
+    filters = GenerateSuiteConfig(
+        classification=args.classification,
+        min_logprob_gap=args.min_logprob_gap,
+        max_samples=args.max_samples,
+        include_expected=args.include_expected,
+    )
+    suite = generate_suite(report, filters)
+    write_suite(suite, Path(args.output))
+    print(format_suite_summary(suite, report), f"\nSuite written to {args.output}", sep="\n")
diff --git a/src/xpyd_acc/cli/parsers.py b/src/xpyd_acc/cli/parsers.py
index 57bdd54..90366d5 100644
--- a/src/xpyd_acc/cli/parsers.py
+++ b/src/xpyd_acc/cli/parsers.py
@@ -58,6 +58,7 @@ def register_all(sub: argparse._SubParsersAction) -> None:
     _register_topology_scan(sub)
     _register_baseline_db(sub)
     _register_smart_retry(sub)
+    _register_generate_suite(sub)
 def _register_compare(sub):
     lp = sub.add_parser("compare-logprobs", help="Compare logprobs between two endpoints")
     lp.add_argument("--baseline", required=True, help="Baseline endpoint URL")
@@ -815,3 +816,15 @@ def _register_smart_retry(sub):
     p.add_argument("--timeout", type=float, default=120.0, help="HTTP timeout seconds")
     p.add_argument("--json", default=None, dest="json_path", help="Export results as JSON")
     p.add_argument("--skip-validation", action="store_true", help="Skip response validation")
+
+def _register_generate_suite(sub):
+    """Register the ``generate-suite`` subcommand and its options on *sub*."""
+    suite = sub.add_parser("generate-suite", help="Generate test dataset from divergent samples")
+    add = suite.add_argument
+    add("--report", help="Path to batch report JSON", required=True)
+    add("--output", help="Output JSONL path", required=True)
+    # Optional filters and output options.
+    add("--classification", help="Filter by classification", default=None)
+    add("--min-logprob-gap", help="Minimum logprob gap filter", default=None, type=float)
+    add("--max-samples", help="Maximum samples in suite", default=None, type=int)
+    add("--include-expected", help="Include baseline output as expected", action="store_true")
diff --git a/src/xpyd_acc/test_suite_gen.py b/src/xpyd_acc/test_suite_gen.py
new file mode 100644
index 0000000..1a39bf8
--- /dev/null
+++ b/src/xpyd_acc/test_suite_gen.py
@@ -0,0 +1,157 @@
+"""Generate reusable test datasets from divergent samples in batch reports."""
+
+from __future__ import annotations
+
+import json
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import Any
+
+from .batch_compare import BatchReport
+from .log import get_logger
+
+logger = get_logger(__name__)
+
+
+@dataclass
+class SuiteEntry:
+    """One record of a generated suite: an id, a prompt, and optional extras."""
+
+    id: str
+    prompt: str
+    expected: str | None = None
+    metadata: dict[str, Any] = field(default_factory=dict)
+
+    def to_dict(self) -> dict[str, Any]:
+        """Return a plain dict, leaving out ``expected`` if None and ``metadata`` if empty."""
+        # Optional keys are emitted only when they carry a value.
+        optional = {"expected": self.expected, "metadata": self.metadata or None}
+        record: dict[str, Any] = {"id": self.id, "prompt": self.prompt}
+        record.update({key: value for key, value in optional.items() if value is not None})
+        return record
+
+    def to_jsonl_line(self) -> str:
+        """Render the entry as one line of JSON, keeping non-ASCII characters unescaped."""
+        payload = self.to_dict()
+        return json.dumps(payload, ensure_ascii=False)
+
+
+@dataclass
+class GenerateSuiteConfig:
+    """Filter and output settings consumed by ``generate_suite()``."""
+
+    classification: str | None = None  # keep only results whose classification equals this
+    deterministic_only: bool = False  # NOTE(review): not read by generate_suite() in this module
+    min_logprob_gap: float | None = None  # drop results with a missing or smaller logprob_gap
+    max_samples: int | None = None  # keep at most this many entries (first ones, report order)
+    include_expected: bool = False  # copy each result's baseline_output into ``expected``
+
+
+def generate_suite(
+    report: BatchReport,
+    config: GenerateSuiteConfig | None = None,
+) -> list[SuiteEntry]:
+    """Generate a test suite from divergent samples in a batch report.
+
+    Results are kept in report order. A result is included when it is divergent,
+    matches ``config.classification`` (if set), and has a ``logprob_gap`` of at
+    least ``config.min_logprob_gap`` (if set; a missing gap never passes).
+    ``config.deterministic_only`` is not applied here.
+
+    Args:
+        report: A BatchReport containing comparison results.
+        config: Optional filtering configuration; defaults are used when None.
+
+    Returns:
+        List of SuiteEntry objects, capped at ``config.max_samples`` when set.
+
+    Raises:
+        ValueError: If ``config.max_samples`` is negative.
+    """
+    if config is None:
+        config = GenerateSuiteConfig()
+
+    cap = config.max_samples
+    if cap is not None and cap < 0:
+        # A negative cap would slice as entries[:cap] and silently drop trailing samples.
+        raise ValueError(f"max_samples must be non-negative, got {cap}")
+
+    entries: list[SuiteEntry] = []
+    for result in report.results:
+        if cap is not None and len(entries) >= cap:
+            break  # cap reached; later results could never be kept
+        if not result.is_divergent():
+            continue
+        if config.classification and result.classification != config.classification:
+            continue
+        gap = result.logprob_gap
+        if config.min_logprob_gap is not None and (gap is None or gap < config.min_logprob_gap):
+            continue
+        entries.append(
+            SuiteEntry(
+                id=result.sample_id,
+                prompt=result.prompt,
+                expected=result.baseline_output if config.include_expected else None,
+                metadata={
+                    "classification": result.classification,
+                    "divergence_index": result.first_divergence_index,
+                    "logprob_gap": gap,
+                    "context_length": result.context_length,
+                },
+            )
+        )
+
+    logger.info("Generated test suite with %d entries from %d divergent samples",
+                len(entries), report.divergent_samples)
+    return entries
+
+
+def write_suite(entries: list[SuiteEntry], output: Path) -> None:
+    """Save the suite as UTF-8 JSONL, one entry per line, creating parent directories.
+
+    Args:
+        entries: Suite entries to serialize, written in the given order.
+        output: Destination file; any existing content is overwritten.
+    """
+    target_dir = output.parent
+    target_dir.mkdir(parents=True, exist_ok=True)
+    with output.open("w", encoding="utf-8") as handle:
+        handle.writelines(f"{item.to_jsonl_line()}\n" for item in entries)
+    logger.info("Wrote %d entries to %s", len(entries), output)
+
+
+def format_suite_summary(entries: list[SuiteEntry], report: BatchReport) -> str:
+    """Format a human-readable summary of the generated test suite.
+
+    Entries whose classification is missing, None, or empty are counted under
+    "unknown", so the per-classification breakdown can always be sorted.
+
+    Args:
+        entries: Generated suite entries.
+        report: The source batch report (supplies total and divergent counts).
+
+    Returns:
+        Formatted summary string.
+    """
+    lines = [
+        "Test Suite Generation Summary",
+        "=" * 40,
+        f"Source report: {report.total_samples} total samples, "
+        f"{report.divergent_samples} divergent",
+        f"Generated suite: {len(entries)} samples",
+        "",
+    ]
+    if not entries:
+        lines.append("No samples matched the filter criteria.")
+        return "\n".join(lines)
+
+    classifications: dict[str, int] = {}
+    for entry in entries:
+        # ``or`` also catches an explicit None, which .get()'s default would let through.
+        cls = entry.metadata.get("classification") or "unknown"
+        classifications[cls] = classifications.get(cls, 0) + 1
+    lines.append("By classification:")
+    lines.extend(f"  {cls}: {count}" for cls, count in sorted(classifications.items()))
+    has_expected = sum(1 for e in entries if e.expected is not None)
+    lines.append(f"\nWith expected output: {has_expected}/{len(entries)}")
+    return "\n".join(lines)
diff --git a/tests/test_test_suite_gen.py b/tests/test_test_suite_gen.py
new file mode 100644
index 0000000..159256b
--- /dev/null
+++ b/tests/test_test_suite_gen.py
@@ -0,0 +1,225 @@
+"""Tests for test suite generation from batch reports."""
+
+from __future__ import annotations
+
+import json
+import tempfile
+from pathlib import Path
+
+from xpyd_acc.batch_compare import BatchReport, SampleResult
+from xpyd_acc.test_suite_gen import (
+    GenerateSuiteConfig,
+    SuiteEntry,
+    format_suite_summary,
+    generate_suite,
+    write_suite,
+)
+
+
+def _make_result(
+    sample_id: str,
+    prompt: str = "test prompt",
+    match: bool = False,
+    classification: str = "likely_bug",
+    logprob_gap: float | None = 0.5,
+    divergence_index: int | None = 5,
+    baseline_output: str = "baseline",
+    target_output: str = "target",
+) -> SampleResult:
+    """Build a SampleResult fixture; divergence logprobs are None for matching samples."""
+    baseline_lp, target_lp = (None, None) if match else (-0.1, -0.6)
+    return SampleResult(
+        sample_id=sample_id,
+        prompt=prompt,
+        exact_match=match,
+        classification=classification,
+        context_length=100,
+        baseline_output=baseline_output, target_output=target_output,
+        first_divergence_index=divergence_index,
+        logprob_gap=logprob_gap,
+        baseline_logprob_at_divergence=baseline_lp, target_logprob_at_divergence=target_lp,
+    )
+
+
+def _make_report(results: list[SampleResult]) -> BatchReport:
+    """Wrap *results* in a BatchReport whose counts derive from ``exact_match``."""
+    total = len(results)
+    mismatched = sum(1 for item in results if not item.exact_match)
+    rate = mismatched / total if results else 0.0
+    return BatchReport(
+        total_samples=total, divergent_samples=mismatched, match_samples=total - mismatched,
+        divergence_rate=rate, results=results,
+    )
+
+
+class TestSuiteEntry:
+    """Serialization behaviour of SuiteEntry."""
+
+    def test_to_dict_without_expected(self):
+        data = SuiteEntry(id="s1", prompt="hello").to_dict()
+        assert data == {"id": "s1", "prompt": "hello"}
+        assert "expected" not in data
+        assert "metadata" not in data
+
+    def test_to_dict_with_expected_and_metadata(self):
+        tags = {"classification": "likely_bug"}
+        data = SuiteEntry(id="s1", prompt="hello", expected="world", metadata=tags).to_dict()
+        assert data["expected"] == "world"
+        assert data["metadata"]["classification"] == "likely_bug"
+
+    def test_to_jsonl_line(self):
+        """The JSONL line must parse back to the entry's id and prompt."""
+        raw_line = SuiteEntry(id="s1", prompt="hello").to_jsonl_line()
+        decoded = json.loads(raw_line)
+        assert decoded["id"] == "s1"
+        assert decoded["prompt"] == "hello"
+
+
+class TestGenerateSuite:
+    """Filtering and entry construction in generate_suite()."""
+
+    def test_basic_generation(self):
+        """Only the non-matching samples end up in the suite."""
+        report = _make_report([
+            _make_result("s1", match=True),
+            _make_result("s2", match=False),
+            _make_result("s3", match=False),
+        ])
+        suite = generate_suite(report)
+        assert len(suite) == 2
+        assert {entry.id for entry in suite} == {"s2", "s3"}
+
+    def test_empty_report(self):
+        """A report without results yields an empty suite."""
+        empty = _make_report([])
+        assert generate_suite(empty) == []
+
+    def test_no_divergent_samples(self):
+        """A report containing only matching samples yields an empty suite."""
+        all_matching = _make_report([_make_result("s1", match=True)])
+        assert generate_suite(all_matching) == []
+
+    def test_filter_by_classification(self):
+        """A classification filter keeps only results with that exact label."""
+        report = _make_report([
+            _make_result("s1", match=False, classification="likely_bug"),
+            _make_result("s2", match=False, classification="likely_uncertainty"),
+            _make_result("s3", match=False, classification="likely_bug"),
+        ])
+        suite = generate_suite(report, GenerateSuiteConfig(classification="likely_bug"))
+        assert len(suite) == 2
+        assert all(entry.metadata["classification"] == "likely_bug" for entry in suite)
+
+    def test_filter_by_min_logprob_gap(self):
+        """Gaps below the threshold and missing gaps are both excluded."""
+        report = _make_report([
+            _make_result("s1", match=False, logprob_gap=0.1),
+            _make_result("s2", match=False, logprob_gap=0.5),
+            _make_result("s3", match=False, logprob_gap=None),
+        ])
+        suite = generate_suite(report, GenerateSuiteConfig(min_logprob_gap=0.3))
+        assert len(suite) == 1
+        assert suite[0].id == "s2"
+
+    def test_max_samples(self):
+        """Ten divergent samples are capped at three entries."""
+        samples = [_make_result(f"s{i}", match=False) for i in range(10)]
+        capped = GenerateSuiteConfig(max_samples=3)
+        suite = generate_suite(_make_report(samples), capped)
+        assert len(suite) == 3
+
+    def test_include_expected(self):
+        """With include_expected, the baseline output is copied to ``expected``."""
+        sample = _make_result("s1", match=False, baseline_output="expected output")
+        with_expected = GenerateSuiteConfig(include_expected=True)
+        suite = generate_suite(_make_report([sample]), with_expected)
+        assert suite[0].expected == "expected output"
+
+    def test_exclude_expected_by_default(self):
+        """Without a config, ``expected`` stays unset."""
+        sample = _make_result("s1", match=False, baseline_output="expected output")
+        suite = generate_suite(_make_report([sample]))
+        assert suite[0].expected is None
+
+    def test_metadata_populated(self):
+        """Classification, gap and divergence index are carried into metadata."""
+        sample = _make_result(
+            "s1", match=False, classification="likely_bug", logprob_gap=0.42, divergence_index=7,
+        )
+        suite = generate_suite(_make_report([sample]))
+        recorded = suite[0].metadata
+        assert recorded["classification"] == "likely_bug"
+        assert recorded["logprob_gap"] == 0.42
+        assert recorded["divergence_index"] == 7
+
+    def test_combined_filters(self):
+        """Classification and gap filters apply together; only s2 passes both."""
+        report = _make_report([
+            _make_result("s1", match=False, classification="likely_bug", logprob_gap=0.1),
+            _make_result("s2", match=False, classification="likely_bug", logprob_gap=0.5),
+            _make_result("s3", match=False, classification="likely_uncertainty", logprob_gap=0.8),
+        ])
+        both = GenerateSuiteConfig(classification="likely_bug", min_logprob_gap=0.3)
+        suite = generate_suite(report, both)
+        assert len(suite) == 1
+        assert suite[0].id == "s2"
+
+
+class TestWriteSuite:
+    """JSONL output of write_suite(), read back as UTF-8 to match how it is written."""
+
+    def test_write_and_read_back(self):
+        """Entries survive a write/read cycle, including a non-ASCII prompt."""
+        entries = [
+            SuiteEntry(id="s1", prompt="héllo", metadata={"classification": "likely_bug"}),
+            SuiteEntry(id="s2", prompt="world", expected="output"),
+        ]
+        with tempfile.TemporaryDirectory() as td:
+            path = Path(td) / "suite.jsonl"
+            write_suite(entries, path)
+            lines = path.read_text(encoding="utf-8").strip().split("\n")
+
+        assert len(lines) == 2
+        parsed = [json.loads(line) for line in lines]
+        assert parsed[0]["id"] == "s1"
+        assert parsed[0]["prompt"] == "héllo"
+        assert parsed[1]["expected"] == "output"
+
+    def test_round_trip_with_batch_compare_dataset(self):
+        """Suite should load as a batch-compare dataset (structural check only)."""
+        entries = [
+            SuiteEntry(id="s1", prompt="What is 2+2?"),
+            SuiteEntry(id="s2", prompt="Hello world"),
+        ]
+        with tempfile.TemporaryDirectory() as td:
+            path = Path(td) / "suite.jsonl"
+            write_suite(entries, path)
+            lines = path.read_text(encoding="utf-8").strip().split("\n")
+
+        records = [json.loads(line) for line in lines]
+        assert all("prompt" in record and "id" in record for record in records)
+
+    def test_creates_parent_dirs(self):
+        with tempfile.TemporaryDirectory() as td:
+            path = Path(td) / "sub" / "dir" / "suite.jsonl"
+            write_suite([SuiteEntry(id="s1", prompt="test")], path)
+            assert path.exists()
+
+
+class TestFormatSuiteSummary:
+    """Human-readable summary text produced by format_suite_summary()."""
+
+    def test_summary_with_entries(self):
+        report = _make_report([
+            _make_result("s1", match=False, classification="likely_bug"),
+            _make_result("s2", match=False, classification="likely_uncertainty"),
+        ])
+        text = format_suite_summary(generate_suite(report), report)
+        for fragment in ("2 samples", "likely_bug", "likely_uncertainty"):
+            assert fragment in text
+
+    def test_summary_empty(self):
+        """With no entries the summary says that nothing matched."""
+        matching_only = _make_report([_make_result("s1", match=True)])
+        text = format_suite_summary([], matching_only)
+        assert "No samples matched" in text