xPyD-hub · hlin99 · Apr 6, 2026 · May 3, 2026
diff --git a/ROADMAP.md b/ROADMAP.md
@@ -879,7 +879,7 @@ get from a Python script calling `/v1/chat/completions`.
 - Transforms raw numbers into actionable verdicts: "expected hardware variance" vs "likely software bug"
 - Community-contributed baselines: users can submit anonymized precision profiles to build the database
 
-## M91: Smart Retry for Divergent Samples
+## M91: Smart Retry for Divergent Samples ✅
 - After initial batch comparison, automatically retry divergent samples with deterministic settings (temperature=0, seed=42)
 - Classifies divergence as: `deterministic` (reproduces under greedy decoding) or `stochastic` (disappears with greedy)
 - `batch-compare --smart-retry` flag triggers automatic rerun of divergent samples
@@ -891,3 +891,20 @@ get from a Python script calling `/v1/chat/completions`.
 - Integrates with existing `--fail-threshold`: only deterministic divergences count toward threshold
 - `smart_retry.py` module: `run_smart_retry()`, `SmartRetryResult`, `format_smart_retry()`
 - Tests covering retry logic, classification, integration, JSON export, CLI
+
+## M92: Test Suite Generation from Divergent Samples
+- `xpyd-acc generate-suite --report <path> --output <path>` extracts divergent samples into a reusable test dataset
+- Output format: JSONL (compatible with `batch-compare --dataset`)
+- Each entry includes: original prompt, expected baseline output, divergence metadata
+- `--classification <value>` filter: only include `likely_bug`, `likely_uncertainty`, etc.
+- `--deterministic-only` flag: only include samples classified as deterministic (from smart-retry)
+- `--min-logprob-gap <float>` filter: only include high-confidence divergences
+- `--max-samples <int>` flag: cap the number of samples in the generated suite
+- `--include-expected` flag: embed baseline output as `expected` field for exact-match regression testing
+- `SuiteEntry` dataclass: id, prompt, expected (optional), metadata (classification, divergence_index, logprob_gap, context_length)
+- `GenerateSuiteConfig` dataclass for filter settings
+- `generate_suite()` function: filter divergent results of a loaded report into entries; `write_suite()` emits the JSONL
+- Round-trip: generated suite works directly with `batch-compare --dataset`
+- Enables CI workflow: batch-compare → generate-suite → commit suite → rerun on every deploy
+- `test_suite_gen.py` module with `generate_suite()`, `write_suite()`, `format_suite_summary()`, `SuiteEntry`, `GenerateSuiteConfig`
+- 15 tests covering generation, filtering, round-trip compatibility, edge cases, CLI integration
diff --git a/src/xpyd_acc/cli/__init__.py b/src/xpyd_acc/cli/__init__.py
@@ -15,6 +15,7 @@
     _run_entropy,
     _run_file_compare,
     _run_fingerprint,
+    _run_generate_suite,
     _run_latency_regression,
     _run_length_bias,
     _run_sensitivity,
@@ -149,6 +150,7 @@ def main(argv: list[str] | None = None) -> None:
         "topology-scan": lambda: handle_topology_scan(args),
         "baseline-db": lambda: handle_baseline_db(args),
         "smart-retry": lambda: _run_smart_retry(args),
+        "generate-suite": lambda: _run_generate_suite(args),
     }
 
     if args.command in _early:

diff --git a/src/xpyd_acc/cli/analysis.py b/src/xpyd_acc/cli/analysis.py
@@ -629,3 +629,28 @@ def _run_smart_retry(args: argparse.Namespace) -> None:
 
     if result.deterministic_count > 0:
         raise SystemExit(1)
+
+
+def _run_generate_suite(args: argparse.Namespace) -> None:
+    """Build a JSONL test suite from a batch report and print a summary."""
+    from pathlib import Path
+
+    from xpyd_acc.batch_compare import load_report
+    from xpyd_acc.test_suite_gen import (
+        GenerateSuiteConfig,
+        format_suite_summary,
+        generate_suite,
+        write_suite,
+    )
+
+    # NOTE(review): GenerateSuiteConfig.deterministic_only is left at its default here.
+    batch = load_report(args.report)
+    filters = GenerateSuiteConfig(
+        classification=args.classification,
+        min_logprob_gap=args.min_logprob_gap,
+        max_samples=args.max_samples,
+        include_expected=args.include_expected,
+    )
+    suite = generate_suite(batch, filters)
+    write_suite(suite, Path(args.output))
+    print(format_suite_summary(suite, batch), f"\nSuite written to {args.output}", sep="\n")
diff --git a/src/xpyd_acc/cli/parsers.py b/src/xpyd_acc/cli/parsers.py
@@ -58,6 +58,7 @@ def register_all(sub: argparse._SubParsersAction) -> None:
     _register_topology_scan(sub)
     _register_baseline_db(sub)
     _register_smart_retry(sub)
+    _register_generate_suite(sub)
 def _register_compare(sub):
     lp = sub.add_parser("compare-logprobs", help="Compare logprobs between two endpoints")
     lp.add_argument("--baseline", required=True, help="Baseline endpoint URL")
@@ -815,3 +816,15 @@ def _register_smart_retry(sub):
     p.add_argument("--timeout", type=float, default=120.0, help="HTTP timeout seconds")
     p.add_argument("--json", default=None, dest="json_path", help="Export results as JSON")
     p.add_argument("--skip-validation", action="store_true", help="Skip response validation")
+
+def _register_generate_suite(sub):
+    """Attach the ``generate-suite`` subcommand and its options to *sub*."""
+    parser = sub.add_parser("generate-suite", help="Generate test dataset from divergent samples")
+    parser.add_argument("--report", required=True, help="Path to batch report JSON")
+    parser.add_argument("--output", required=True, help="Output JSONL path")
+    parser.add_argument("--classification", default=None, help="Filter by classification")
+    gap_help = "Minimum logprob gap filter"
+    parser.add_argument("--min-logprob-gap", type=float, default=None, help=gap_help)
+    parser.add_argument("--max-samples", type=int, default=None, help="Maximum samples in suite")
+    expected_help = "Include baseline output as expected"
+    parser.add_argument("--include-expected", action="store_true", help=expected_help)
diff --git a/src/xpyd_acc/test_suite_gen.py b/src/xpyd_acc/test_suite_gen.py
@@ -0,0 +1,157 @@
+"""Generate reusable test datasets from divergent samples in batch reports."""
+
+from __future__ import annotations
+
+import json
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import Any
+
+from .batch_compare import BatchReport
+from .log import get_logger
+
+logger = get_logger(__name__)
+
+
+@dataclass
+class SuiteEntry:
+    """One prompt of a generated suite, with optional expected text and metadata."""
+
+    id: str
+    prompt: str
+    expected: str | None = None
+    metadata: dict[str, Any] = field(default_factory=dict)
+
+    def to_dict(self) -> dict[str, Any]:
+        """Return a plain dict; drops ``expected`` when None and ``metadata`` when empty."""
+        # Key order stays id, prompt, expected, metadata.
+        extras: dict[str, Any] = {} if self.expected is None else {"expected": self.expected}
+        if self.metadata:
+            extras["metadata"] = self.metadata
+        return {"id": self.id, "prompt": self.prompt, **extras}
+
+    def to_jsonl_line(self) -> str:
+        """Return the entry as JSON text, leaving non-ASCII characters unescaped."""
+        payload = self.to_dict()
+        return json.dumps(payload, ensure_ascii=False)
+
+
+@dataclass
+class GenerateSuiteConfig:
+    """Filter and output options read by ``generate_suite``; defaults apply no filtering."""
+
+    classification: str | None = None  # keep only results with this classification; falsy = off
+    deterministic_only: bool = False  # NOTE(review): generate_suite applies no filter for this flag
+    min_logprob_gap: float | None = None  # drop results whose logprob_gap is None or below this
+    max_samples: int | None = None  # cap on suite length, applied in report order; None = no cap
+    include_expected: bool = False  # copy each result's baseline_output into SuiteEntry.expected
+
+
+def generate_suite(
+    report: BatchReport,
+    config: GenerateSuiteConfig | None = None,
+) -> list[SuiteEntry]:
+    """Generate a test suite from divergent samples in a batch report.
+
+    Results are visited in report order; a result is kept only if it is
+    divergent and passes the classification and logprob-gap filters.
+
+    Args:
+        report: A BatchReport containing comparison results.
+        config: Optional filtering configuration; defaults to no filtering.
+
+    Returns:
+        List of SuiteEntry objects, at most ``config.max_samples`` long.
+    """
+    if config is None:
+        config = GenerateSuiteConfig()
+
+    if config.deterministic_only:
+        # Nothing below reads a determinism label, so say so instead of
+        # silently returning an unfiltered suite.
+        logger.warning("deterministic_only is not applied by generate_suite")
+
+    # A negative cap would make a slice drop entries from the end; treat it as 0.
+    limit = None if config.max_samples is None else max(config.max_samples, 0)
+
+    entries: list[SuiteEntry] = []
+    for result in report.results:
+        if limit is not None and len(entries) >= limit:
+            break  # cap reached; later results cannot change the output
+        if not result.is_divergent():
+            continue
+        if config.classification and result.classification != config.classification:
+            continue
+        if config.min_logprob_gap is not None and (
+            result.logprob_gap is None or result.logprob_gap < config.min_logprob_gap
+        ):
+            continue
+
+        entries.append(
+            SuiteEntry(
+                id=result.sample_id,
+                prompt=result.prompt,
+                expected=result.baseline_output if config.include_expected else None,
+                metadata={
+                    "classification": result.classification,
+                    "divergence_index": result.first_divergence_index,
+                    "logprob_gap": result.logprob_gap,
+                    "context_length": result.context_length,
+                },
+            )
+        )
+
+    logger.info("Generated test suite with %d entries from %d divergent samples",
+                len(entries), report.divergent_samples)
+    return entries
+
+
+def write_suite(entries: list[SuiteEntry], output: Path) -> None:
+    """Serialize *entries* to *output* as JSONL, one entry per line.
+
+    Args:
+        entries: Suite entries, written in the order given.
+        output: Destination file; missing parent directories are created.
+    """
+    output.parent.mkdir(parents=True, exist_ok=True)
+    # "w" truncates an existing file, so each run replaces the previous suite.
+    with output.open("w", encoding="utf-8") as sink:
+        sink.writelines(item.to_jsonl_line() + "\n" for item in entries)
+    logger.info("Wrote %d entries to %s", len(entries), output)
+
+
+def format_suite_summary(entries: list[SuiteEntry], report: BatchReport) -> str:
+    """Format a human-readable summary of the generated test suite.
+
+    Args:
+        entries: Generated suite entries.
+        report: The source batch report; only its sample counters are read.
+
+    Returns:
+        Summary text. A missing or falsy classification counts as "unknown".
+    """
+    lines = [
+        "Test Suite Generation Summary",
+        "=" * 40,
+        f"Source report: {report.total_samples} total samples, "
+        f"{report.divergent_samples} divergent",
+        f"Generated suite: {len(entries)} samples",
+        "",
+    ]
+
+    if not entries:
+        lines.append("No samples matched the filter criteria.")
+        return "\n".join(lines)
+
+    # `or` (not a .get default) also maps an explicit None to "unknown", which
+    # keeps every key a str so sorted() below cannot raise TypeError.
+    classifications: dict[str, int] = {}
+    for entry in entries:
+        cls = entry.metadata.get("classification") or "unknown"
+        classifications[cls] = classifications.get(cls, 0) + 1
+
+    lines.append("By classification:")
+    lines.extend(f"  {cls}: {count}" for cls, count in sorted(classifications.items()))
+    has_expected = sum(1 for e in entries if e.expected is not None)
+    lines.append(f"\nWith expected output: {has_expected}/{len(entries)}")
+    return "\n".join(lines)