diff --git a/ROADMAP.md b/ROADMAP.md index 1fd88dc..5c27e5b 100644 --- a/ROADMAP.md +++ b/ROADMAP.md @@ -879,7 +879,7 @@ get from a Python script calling `/v1/chat/completions`. - Transforms raw numbers into actionable verdicts: "expected hardware variance" vs "likely software bug" - Community-contributed baselines: users can submit anonymized precision profiles to build the database -## M91: Smart Retry for Divergent Samples +## M91: Smart Retry for Divergent Samples ✅ - After initial batch comparison, automatically retry divergent samples with deterministic settings (temperature=0, seed=42) - Classifies divergence as: `deterministic` (reproduces under greedy decoding) or `stochastic` (disappears with greedy) - `batch-compare --smart-retry` flag triggers automatic rerun of divergent samples @@ -891,3 +891,20 @@ get from a Python script calling `/v1/chat/completions`. - Integrates with existing `--fail-threshold`: only deterministic divergences count toward threshold - `smart_retry.py` module: `run_smart_retry()`, `SmartRetryResult`, `format_smart_retry()` - Tests covering retry logic, classification, integration, JSON export, CLI + +## M92: Test Suite Generation from Divergent Samples +- `xpyd-acc generate-suite --report <path> --output <path>` extracts divergent samples into a reusable test dataset +- Output format: JSONL (compatible with `batch-compare --dataset`) +- Each entry includes: original prompt, expected baseline output, divergence metadata +- `--classification <label>` filter: only include `likely_bug`, `likely_uncertainty`, etc. 
+- `--deterministic-only` flag: only include samples classified as deterministic (from smart-retry) +- `--min-logprob-gap ` filter: only include high-confidence divergences +- `--max-samples ` cap the number of samples in the generated suite +- `--include-expected` flag: embed baseline output as `expected` field for exact-match regression testing +- `SuiteEntry` dataclass: id, prompt, expected (optional), metadata (classification, divergence_index, logprob_gap) +- `GenerateSuiteConfig` dataclass for filter settings +- `generate_suite()` function: load report → filter → emit JSONL +- Round-trip: generated suite works directly with `batch-compare --dataset` +- Enables CI workflow: batch-compare → generate-suite → commit suite → rerun on every deploy +- `test_suite_gen.py` module with `generate_suite()`, `SuiteEntry`, `GenerateSuiteConfig` +- 15 tests covering generation, filtering, round-trip compatibility, edge cases, CLI integration diff --git a/src/xpyd_acc/cli/__init__.py b/src/xpyd_acc/cli/__init__.py index a0203f6..681e6b2 100644 --- a/src/xpyd_acc/cli/__init__.py +++ b/src/xpyd_acc/cli/__init__.py @@ -15,6 +15,7 @@ _run_entropy, _run_file_compare, _run_fingerprint, + _run_generate_suite, _run_latency_regression, _run_length_bias, _run_sensitivity, @@ -149,6 +150,7 @@ def main(argv: list[str] | None = None) -> None: "topology-scan": lambda: handle_topology_scan(args), "baseline-db": lambda: handle_baseline_db(args), "smart-retry": lambda: _run_smart_retry(args), + "generate-suite": lambda: _run_generate_suite(args), } if args.command in _early: diff --git a/src/xpyd_acc/cli/analysis.py b/src/xpyd_acc/cli/analysis.py index baede3c..255faa0 100644 --- a/src/xpyd_acc/cli/analysis.py +++ b/src/xpyd_acc/cli/analysis.py @@ -629,3 +629,28 @@ def _run_smart_retry(args: argparse.Namespace) -> None: if result.deterministic_count > 0: raise SystemExit(1) + + +def _run_generate_suite(args: argparse.Namespace) -> None: + """Handle the generate-suite subcommand.""" + from 
pathlib import Path + + from xpyd_acc.batch_compare import load_report + from xpyd_acc.test_suite_gen import ( + GenerateSuiteConfig, + format_suite_summary, + generate_suite, + write_suite, + ) + + report = load_report(args.report) + config = GenerateSuiteConfig( + classification=args.classification, + min_logprob_gap=args.min_logprob_gap, + max_samples=args.max_samples, + include_expected=args.include_expected, + ) + entries = generate_suite(report, config) + write_suite(entries, Path(args.output)) + print(format_suite_summary(entries, report)) + print(f"\nSuite written to {args.output}") diff --git a/src/xpyd_acc/cli/parsers.py b/src/xpyd_acc/cli/parsers.py index 57bdd54..90366d5 100644 --- a/src/xpyd_acc/cli/parsers.py +++ b/src/xpyd_acc/cli/parsers.py @@ -58,6 +58,7 @@ def register_all(sub: argparse._SubParsersAction) -> None: _register_topology_scan(sub) _register_baseline_db(sub) _register_smart_retry(sub) + _register_generate_suite(sub) def _register_compare(sub): lp = sub.add_parser("compare-logprobs", help="Compare logprobs between two endpoints") lp.add_argument("--baseline", required=True, help="Baseline endpoint URL") @@ -815,3 +816,15 @@ def _register_smart_retry(sub): p.add_argument("--timeout", type=float, default=120.0, help="HTTP timeout seconds") p.add_argument("--json", default=None, dest="json_path", help="Export results as JSON") p.add_argument("--skip-validation", action="store_true", help="Skip response validation") + +def _register_generate_suite(sub): + p = sub.add_parser("generate-suite", help="Generate test dataset from divergent samples") + p.add_argument("--report", required=True, help="Path to batch report JSON") + p.add_argument("--output", required=True, help="Output JSONL path") + p.add_argument("--classification", default=None, help="Filter by classification") + p.add_argument("--min-logprob-gap", type=float, default=None, help="Minimum logprob gap filter") + p.add_argument("--max-samples", type=int, default=None, help="Maximum 
@dataclass
class SuiteEntry:
    """A single prompt record in a generated test suite.

    ``expected`` is optional and ``metadata`` defaults to an empty dict; both
    are left out of the serialized form when unset/empty (see ``to_dict``).
    """

    id: str
    prompt: str
    expected: str | None = None
    metadata: dict[str, Any] = field(default_factory=dict)

    def to_dict(self) -> dict[str, Any]:
        """Return a JSON-ready dict.

        ``expected`` is omitted when it is ``None`` and ``metadata`` is omitted
        when it is empty, so minimal entries serialize to just id + prompt.
        """
        payload: dict[str, Any] = {"id": self.id, "prompt": self.prompt}
        if self.expected is not None:
            payload["expected"] = self.expected
        if self.metadata:
            payload["metadata"] = self.metadata
        return payload

    def to_jsonl_line(self) -> str:
        """Serialize to one JSON line (no trailing newline), keeping non-ASCII text unescaped."""
        return json.dumps(self.to_dict(), ensure_ascii=False)


@dataclass
class GenerateSuiteConfig:
    """Filter settings for :func:`generate_suite`.

    Every field defaults to "no filtering".
    """

    # Keep only samples whose classification equals this value (falsy = no filter).
    classification: str | None = None
    # NOTE(review): accepted but not applied by generate_suite(); see the warning there.
    deterministic_only: bool = False
    # Keep only samples whose logprob_gap is known and >= this value.
    min_logprob_gap: float | None = None
    # Upper bound on the number of entries returned; must be non-negative.
    max_samples: int | None = None
    # When True, copy the sample's baseline output into ``SuiteEntry.expected``.
    include_expected: bool = False


def generate_suite(
    report: BatchReport,
    config: GenerateSuiteConfig | None = None,
) -> list[SuiteEntry]:
    """Generate a test suite from the divergent samples of a batch report.

    Samples are taken in report order. A sample is kept when it is divergent
    and passes the classification and minimum-logprob-gap filters of *config*;
    at most ``config.max_samples`` entries are returned.

    Args:
        report: A BatchReport containing comparison results.
        config: Optional filtering configuration; defaults to no filtering.

    Returns:
        List of SuiteEntry objects for the generated test suite.

    Raises:
        ValueError: If ``config.max_samples`` is negative. (A negative cap used
            to be applied as a negative slice, silently dropping trailing
            entries instead of limiting the suite.)
    """
    if config is None:
        config = GenerateSuiteConfig()

    cap = config.max_samples
    if cap is not None and cap < 0:
        raise ValueError(f"max_samples must be non-negative, got {cap}")

    if config.deterministic_only:
        # The option exists on the config but no filter is implemented here;
        # say so rather than silently returning an unfiltered suite.
        logger.warning(
            "deterministic_only=True is currently ignored by generate_suite(); "
            "no deterministic filter is applied"
        )

    entries: list[SuiteEntry] = []

    for result in report.results:
        # Stop as soon as the cap is reached; later samples cannot be included.
        if cap is not None and len(entries) >= cap:
            break

        # Only divergent samples belong in the suite.
        if not result.is_divergent():
            continue

        # Classification filter (an empty/None value disables it).
        if config.classification and result.classification != config.classification:
            continue

        # Minimum logprob gap filter; samples with an unknown gap are excluded.
        if config.min_logprob_gap is not None:
            gap = result.logprob_gap
            if gap is None or gap < config.min_logprob_gap:
                continue

        entries.append(
            SuiteEntry(
                id=result.sample_id,
                prompt=result.prompt,
                expected=result.baseline_output if config.include_expected else None,
                metadata={
                    "classification": result.classification,
                    "divergence_index": result.first_divergence_index,
                    "logprob_gap": result.logprob_gap,
                    "context_length": result.context_length,
                },
            )
        )

    logger.info("Generated test suite with %d entries from %d divergent samples",
                len(entries), report.divergent_samples)
    return entries


def write_suite(entries: list[SuiteEntry], output: Path) -> None:
    """Write test suite entries to a JSONL file, one entry per line.

    Missing parent directories are created; an existing file is overwritten.

    Args:
        entries: List of SuiteEntry objects.
        output: Path to the output JSONL file.
    """
    output.parent.mkdir(parents=True, exist_ok=True)
    with open(output, "w", encoding="utf-8") as f:
        f.writelines(entry.to_jsonl_line() + "\n" for entry in entries)
    logger.info("Wrote %d entries to %s", len(entries), output)


def format_suite_summary(entries: list[SuiteEntry], report: BatchReport) -> str:
    """Format a human-readable summary of the generated test suite.

    Args:
        entries: Generated suite entries.
        report: The source batch report (its sample counts are shown).

    Returns:
        Formatted multi-line summary string.
    """
    lines = [
        "Test Suite Generation Summary",
        "=" * 40,
        f"Source report: {report.total_samples} total samples, "
        f"{report.divergent_samples} divergent",
        f"Generated suite: {len(entries)} samples",
        "",
    ]

    if not entries:
        lines.append("No samples matched the filter criteria.")
        return "\n".join(lines)

    counts: dict[str, int] = {}
    for entry in entries:
        # A missing *or* None classification is reported as "unknown"; keeping
        # None as a key would make the sorted() below fail when mixed with str.
        label = entry.metadata.get("classification") or "unknown"
        counts[label] = counts.get(label, 0) + 1

    lines.append("By classification:")
    lines.extend(f"  {label}: {count}" for label, count in sorted(counts.items()))

    with_expected = sum(1 for entry in entries if entry.expected is not None)
    lines.append(f"\nWith expected output: {with_expected}/{len(entries)}")

    return "\n".join(lines)
def _make_result(
    sample_id: str,
    prompt: str = "test prompt",
    match: bool = False,
    classification: str = "likely_bug",
    logprob_gap: float | None = 0.5,
    divergence_index: int | None = 5,
    baseline_output: str = "baseline",
    target_output: str = "target",
) -> SampleResult:
    """Build a SampleResult fixture.

    The per-side divergence logprobs are only populated for non-matching samples.
    """
    diverged = not match
    baseline_lp = -0.1 if diverged else None
    target_lp = -0.6 if diverged else None
    return SampleResult(
        sample_id=sample_id,
        prompt=prompt,
        baseline_output=baseline_output,
        target_output=target_output,
        exact_match=match,
        first_divergence_index=divergence_index,
        baseline_logprob_at_divergence=baseline_lp,
        target_logprob_at_divergence=target_lp,
        logprob_gap=logprob_gap,
        classification=classification,
        context_length=100,
    )


def _make_report(results: list[SampleResult]) -> BatchReport:
    """Wrap *results* in a BatchReport with counts derived from ``exact_match``."""
    total = len(results)
    n_divergent = len([r for r in results if not r.exact_match])
    rate = n_divergent / total if results else 0.0
    return BatchReport(
        total_samples=total,
        divergent_samples=n_divergent,
        match_samples=total - n_divergent,
        divergence_rate=rate,
        results=results,
    )


class TestSuiteEntry:
    """Serialization behaviour of SuiteEntry."""

    def test_to_dict_without_expected(self):
        as_dict = SuiteEntry(id="s1", prompt="hello").to_dict()
        assert as_dict == {"id": "s1", "prompt": "hello"}
        assert "expected" not in as_dict
        assert "metadata" not in as_dict

    def test_to_dict_with_expected_and_metadata(self):
        entry = SuiteEntry(
            id="s1",
            prompt="hello",
            expected="world",
            metadata={"classification": "likely_bug"},
        )
        as_dict = entry.to_dict()
        assert as_dict["expected"] == "world"
        assert as_dict["metadata"]["classification"] == "likely_bug"

    def test_to_jsonl_line(self):
        decoded = json.loads(SuiteEntry(id="s1", prompt="hello").to_jsonl_line())
        assert decoded["id"] == "s1"
        assert decoded["prompt"] == "hello"


class TestGenerateSuite:
    """Selection and filtering behaviour of generate_suite()."""

    def test_basic_generation(self):
        report = _make_report(
            [
                _make_result("s1", match=True),
                _make_result("s2", match=False),
                _make_result("s3", match=False),
            ]
        )
        suite = generate_suite(report)
        assert len(suite) == 2
        assert {item.id for item in suite} == {"s2", "s3"}

    def test_empty_report(self):
        assert generate_suite(_make_report([])) == []

    def test_no_divergent_samples(self):
        report = _make_report([_make_result("s1", match=True)])
        assert generate_suite(report) == []

    def test_filter_by_classification(self):
        report = _make_report(
            [
                _make_result("s1", match=False, classification="likely_bug"),
                _make_result("s2", match=False, classification="likely_uncertainty"),
                _make_result("s3", match=False, classification="likely_bug"),
            ]
        )
        suite = generate_suite(report, GenerateSuiteConfig(classification="likely_bug"))
        assert len(suite) == 2
        assert all(item.metadata["classification"] == "likely_bug" for item in suite)

    def test_filter_by_min_logprob_gap(self):
        report = _make_report(
            [
                _make_result("s1", match=False, logprob_gap=0.1),
                _make_result("s2", match=False, logprob_gap=0.5),
                _make_result("s3", match=False, logprob_gap=None),
            ]
        )
        suite = generate_suite(report, GenerateSuiteConfig(min_logprob_gap=0.3))
        assert len(suite) == 1
        assert suite[0].id == "s2"

    def test_max_samples(self):
        report = _make_report([_make_result(f"s{i}", match=False) for i in range(10)])
        suite = generate_suite(report, GenerateSuiteConfig(max_samples=3))
        assert len(suite) == 3

    def test_include_expected(self):
        report = _make_report(
            [_make_result("s1", match=False, baseline_output="expected output")]
        )
        suite = generate_suite(report, GenerateSuiteConfig(include_expected=True))
        assert suite[0].expected == "expected output"

    def test_exclude_expected_by_default(self):
        report = _make_report(
            [_make_result("s1", match=False, baseline_output="expected output")]
        )
        suite = generate_suite(report)
        assert suite[0].expected is None

    def test_metadata_populated(self):
        sample = _make_result(
            "s1",
            match=False,
            classification="likely_bug",
            logprob_gap=0.42,
            divergence_index=7,
        )
        meta = generate_suite(_make_report([sample]))[0].metadata
        assert meta["classification"] == "likely_bug"
        assert meta["logprob_gap"] == 0.42
        assert meta["divergence_index"] == 7

    def test_combined_filters(self):
        report = _make_report(
            [
                _make_result("s1", match=False, classification="likely_bug", logprob_gap=0.1),
                _make_result("s2", match=False, classification="likely_bug", logprob_gap=0.5),
                _make_result(
                    "s3", match=False, classification="likely_uncertainty", logprob_gap=0.8
                ),
            ]
        )
        filters = GenerateSuiteConfig(classification="likely_bug", min_logprob_gap=0.3)
        suite = generate_suite(report, filters)
        assert len(suite) == 1
        assert suite[0].id == "s2"


class TestWriteSuite:
    """File output of write_suite()."""

    def test_write_and_read_back(self):
        suite = [
            SuiteEntry(id="s1", prompt="hello", metadata={"classification": "likely_bug"}),
            SuiteEntry(id="s2", prompt="world", expected="output"),
        ]
        with tempfile.TemporaryDirectory() as tmp_dir:
            target = Path(tmp_dir) / "suite.jsonl"
            write_suite(suite, target)

            raw_lines = target.read_text().strip().split("\n")
            assert len(raw_lines) == 2

            records = [json.loads(raw) for raw in raw_lines]
            assert records[0]["id"] == "s1"
            assert records[0]["prompt"] == "hello"
            assert records[1]["expected"] == "output"

    def test_round_trip_with_batch_compare_dataset(self):
        """Generated suite should be loadable as a batch-compare dataset."""
        suite = [
            SuiteEntry(id="s1", prompt="What is 2+2?"),
            SuiteEntry(id="s2", prompt="Hello world"),
        ]
        with tempfile.TemporaryDirectory() as tmp_dir:
            target = Path(tmp_dir) / "suite.jsonl"
            write_suite(suite, target)

            # Each line must be valid JSON carrying the 'prompt' and 'id' fields.
            for raw in target.read_text().strip().split("\n"):
                record = json.loads(raw)
                assert "prompt" in record
                assert "id" in record

    def test_creates_parent_dirs(self):
        with tempfile.TemporaryDirectory() as tmp_dir:
            nested = Path(tmp_dir) / "sub" / "dir" / "suite.jsonl"
            write_suite([SuiteEntry(id="s1", prompt="test")], nested)
            assert nested.exists()


class TestFormatSuiteSummary:
    """Human-readable summary output."""

    def test_summary_with_entries(self):
        report = _make_report(
            [
                _make_result("s1", match=False, classification="likely_bug"),
                _make_result("s2", match=False, classification="likely_uncertainty"),
            ]
        )
        text = format_suite_summary(generate_suite(report), report)
        assert "2 samples" in text
        assert "likely_bug" in text
        assert "likely_uncertainty" in text

    def test_summary_empty(self):
        report = _make_report([_make_result("s1", match=True)])
        text = format_suite_summary([], report)
        assert "No samples matched" in text