Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 18 additions & 1 deletion ROADMAP.md
Original file line number Diff line number Diff line change
Expand Up @@ -879,7 +879,7 @@ get from a Python script calling `/v1/chat/completions`.
- Transforms raw numbers into actionable verdicts: "expected hardware variance" vs "likely software bug"
- Community-contributed baselines: users can submit anonymized precision profiles to build the database

## M91: Smart Retry for Divergent Samples
## M91: Smart Retry for Divergent Samples
- After initial batch comparison, automatically retry divergent samples with deterministic settings (temperature=0, seed=42)
- Classifies divergence as: `deterministic` (reproduces under greedy decoding) or `stochastic` (disappears with greedy)
- `batch-compare --smart-retry` flag triggers automatic rerun of divergent samples
Expand All @@ -891,3 +891,20 @@ get from a Python script calling `/v1/chat/completions`.
- Integrates with existing `--fail-threshold`: only deterministic divergences count toward threshold
- `smart_retry.py` module: `run_smart_retry()`, `SmartRetryResult`, `format_smart_retry()`
- Tests covering retry logic, classification, integration, JSON export, CLI

## M92: Test Suite Generation from Divergent Samples
- `xpyd-acc generate-suite --report <path> --output <path>` extracts divergent samples into a reusable test dataset
- Output format: JSONL (compatible with `batch-compare --dataset`)
- Each entry includes: original prompt, expected baseline output, divergence metadata
- `--classification <value>` filter: only include `likely_bug`, `likely_uncertainty`, etc.
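- `--deterministic-only` flag: only include samples classified as deterministic (from smart-retry) — planned: the `generate-suite` parser does not register this flag yet, and `generate_suite()` does not read `GenerateSuiteConfig.deterministic_only`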
- `--min-logprob-gap <float>` filter: only include high-confidence divergences
- `--max-samples <int>`: cap the number of samples in the generated suite
- `--include-expected` flag: embed baseline output as `expected` field for exact-match regression testing
- `SuiteEntry` dataclass: id, prompt, expected (optional), metadata (classification, divergence_index, logprob_gap)
- `GenerateSuiteConfig` dataclass for filter settings
- `generate_suite()` function: load report → filter → emit JSONL
- Round-trip: generated suite works directly with `batch-compare --dataset`
- Enables CI workflow: batch-compare → generate-suite → commit suite → rerun on every deploy
- `test_suite_gen.py` module with `generate_suite()`, `write_suite()`, `format_suite_summary()`, `SuiteEntry`, `GenerateSuiteConfig`
- 15 tests covering generation, filtering, round-trip compatibility, edge cases, CLI integration
2 changes: 2 additions & 0 deletions src/xpyd_acc/cli/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
_run_entropy,
_run_file_compare,
_run_fingerprint,
_run_generate_suite,
_run_latency_regression,
_run_length_bias,
_run_sensitivity,
Expand Down Expand Up @@ -149,6 +150,7 @@ def main(argv: list[str] | None = None) -> None:
"topology-scan": lambda: handle_topology_scan(args),
"baseline-db": lambda: handle_baseline_db(args),
"smart-retry": lambda: _run_smart_retry(args),
"generate-suite": lambda: _run_generate_suite(args),
}

if args.command in _early:
Expand Down
25 changes: 25 additions & 0 deletions src/xpyd_acc/cli/analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -629,3 +629,28 @@ def _run_smart_retry(args: argparse.Namespace) -> None:

if result.deterministic_count > 0:
raise SystemExit(1)


def _run_generate_suite(args: argparse.Namespace) -> None:
    """Handle the generate-suite subcommand.

    Loads the batch report named by ``args.report``, builds a suite from it
    using the filter options on ``args``, writes the suite to ``args.output``
    and prints a summary followed by the output location.

    Args:
        args: Parsed CLI namespace; reads ``report``, ``output``,
            ``classification``, ``min_logprob_gap``, ``max_samples`` and
            ``include_expected``.
    """
    from pathlib import Path

    from xpyd_acc.batch_compare import load_report
    from xpyd_acc.test_suite_gen import (
        GenerateSuiteConfig,
        format_suite_summary,
        generate_suite,
        write_suite,
    )

    batch_report = load_report(args.report)

    suite_entries = generate_suite(
        batch_report,
        GenerateSuiteConfig(
            classification=args.classification,
            min_logprob_gap=args.min_logprob_gap,
            max_samples=args.max_samples,
            include_expected=args.include_expected,
        ),
    )

    destination = Path(args.output)
    write_suite(suite_entries, destination)

    summary = format_suite_summary(suite_entries, batch_report)
    print(summary)
    print(f"\nSuite written to {args.output}")
13 changes: 13 additions & 0 deletions src/xpyd_acc/cli/parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@ def register_all(sub: argparse._SubParsersAction) -> None:
_register_topology_scan(sub)
_register_baseline_db(sub)
_register_smart_retry(sub)
_register_generate_suite(sub)
def _register_compare(sub):
lp = sub.add_parser("compare-logprobs", help="Compare logprobs between two endpoints")
lp.add_argument("--baseline", required=True, help="Baseline endpoint URL")
Expand Down Expand Up @@ -815,3 +816,15 @@ def _register_smart_retry(sub):
p.add_argument("--timeout", type=float, default=120.0, help="HTTP timeout seconds")
p.add_argument("--json", default=None, dest="json_path", help="Export results as JSON")
p.add_argument("--skip-validation", action="store_true", help="Skip response validation")

def _register_generate_suite(sub):
p = sub.add_parser("generate-suite", help="Generate test dataset from divergent samples")
p.add_argument("--report", required=True, help="Path to batch report JSON")
p.add_argument("--output", required=True, help="Output JSONL path")
p.add_argument("--classification", default=None, help="Filter by classification")
p.add_argument("--min-logprob-gap", type=float, default=None, help="Minimum logprob gap filter")
p.add_argument("--max-samples", type=int, default=None, help="Maximum samples in suite")
p.add_argument(
"--include-expected", action="store_true",
help="Include baseline output as expected",
)
157 changes: 157 additions & 0 deletions src/xpyd_acc/test_suite_gen.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,157 @@
"""Generate reusable test datasets from divergent samples in batch reports."""

from __future__ import annotations

import json
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any

from .batch_compare import BatchReport
from .log import get_logger

logger = get_logger(__name__)


@dataclass
class SuiteEntry:
    """One record of a generated test suite.

    ``expected`` and ``metadata`` are optional and are left out of the
    serialized form when they are ``None`` / empty respectively.
    """

    id: str
    prompt: str
    expected: str | None = None
    metadata: dict[str, Any] = field(default_factory=dict)

    def to_dict(self) -> dict[str, Any]:
        """Return a plain dict with ``id`` and ``prompt`` plus any optional fields that are set."""
        payload: dict[str, Any] = {"id": self.id, "prompt": self.prompt}
        # (key, value, include?) — order here fixes the key order of the output.
        optional_fields = (
            ("expected", self.expected, self.expected is not None),
            ("metadata", self.metadata, bool(self.metadata)),
        )
        payload.update(
            {key: value for key, value, keep in optional_fields if keep}
        )
        return payload

    def to_jsonl_line(self) -> str:
        """Return the entry as one JSON document on a single line (non-ASCII kept verbatim)."""
        record = self.to_dict()
        return json.dumps(record, ensure_ascii=False)


@dataclass
class GenerateSuiteConfig:
    """Configuration for test suite generation filters.

    Every field has a default, so ``GenerateSuiteConfig()`` applies no
    classification or logprob-gap filter, no sample cap, and omits the
    expected output from generated entries.
    """

    # Keep only results whose classification equals this value; None or an
    # empty string disables the filter.
    classification: str | None = None
    # NOTE(review): declared here but not read by generate_suite() in this
    # module — confirm where deterministic-only filtering is meant to happen.
    deterministic_only: bool = False
    # When set, results whose logprob_gap is None or below this value are dropped.
    min_logprob_gap: float | None = None
    # Upper bound on the number of entries returned; None means no cap.
    max_samples: int | None = None
    # When True, each entry's `expected` is filled from the result's baseline_output.
    include_expected: bool = False


def generate_suite(
    report: BatchReport,
    config: GenerateSuiteConfig | None = None,
) -> list[SuiteEntry]:
    """Generate a test suite from divergent samples in a batch report.

    Results are visited in report order. A result becomes a suite entry only
    if ``result.is_divergent()`` is true and it passes the classification and
    minimum-logprob-gap filters in *config*. Collection stops as soon as
    ``config.max_samples`` entries have been gathered.

    Args:
        report: A BatchReport containing comparison results.
        config: Optional filtering configuration. Defaults to
            ``GenerateSuiteConfig()``.

    Returns:
        List of SuiteEntry objects for the generated test suite.

    Raises:
        ValueError: If ``config.max_samples`` is negative.
    """
    if config is None:
        config = GenerateSuiteConfig()

    cap = config.max_samples
    if cap is not None and cap < 0:
        # A negative cap used as a slice bound would silently drop entries
        # from the END of the list instead of limiting its size.
        raise ValueError(f"max_samples must be non-negative, got {cap}")

    # NOTE(review): config.deterministic_only is not applied by this function;
    # confirm which result attribute carries the smart-retry classification
    # before wiring that filter in.

    entries: list[SuiteEntry] = []

    for result in report.results:
        # Stop early once the cap is reached (also covers a cap of 0).
        if cap is not None and len(entries) >= cap:
            break

        # Only include divergent samples
        if not result.is_divergent():
            continue

        # Filter by classification
        if config.classification and result.classification != config.classification:
            continue

        # Filter by minimum logprob gap; a missing gap cannot satisfy the filter
        if config.min_logprob_gap is not None and (
            result.logprob_gap is None
            or result.logprob_gap < config.min_logprob_gap
        ):
            continue

        metadata: dict[str, Any] = {
            "classification": result.classification,
            "divergence_index": result.first_divergence_index,
            "logprob_gap": result.logprob_gap,
            "context_length": result.context_length,
        }

        entries.append(
            SuiteEntry(
                id=result.sample_id,
                prompt=result.prompt,
                expected=result.baseline_output if config.include_expected else None,
                metadata=metadata,
            )
        )

    logger.info(
        "Generated test suite with %d entries from %d divergent samples",
        len(entries),
        report.divergent_samples,
    )
    return entries


def write_suite(entries: list[SuiteEntry], output: Path) -> None:
    """Write the suite to *output* as JSONL, one entry per line.

    Missing parent directories are created first, and the file is opened in
    write mode with UTF-8 encoding.

    Args:
        entries: List of SuiteEntry objects.
        output: Path to the output JSONL file.
    """
    target_dir = output.parent
    target_dir.mkdir(parents=True, exist_ok=True)

    with output.open("w", encoding="utf-8") as handle:
        handle.writelines(f"{item.to_jsonl_line()}\n" for item in entries)

    logger.info("Wrote %d entries to %s", len(entries), output)


def format_suite_summary(entries: list[SuiteEntry], report: BatchReport) -> str:
    """Format a human-readable summary of the generated test suite.

    The summary gives the source report's totals and the suite size. When the
    suite is non-empty it adds a per-classification count (sorted by label)
    and how many entries carry an expected output.

    Args:
        entries: Generated suite entries.
        report: The source batch report.

    Returns:
        Formatted summary string.
    """
    lines = [
        "Test Suite Generation Summary",
        "=" * 40,
        f"Source report: {report.total_samples} total samples, "
        f"{report.divergent_samples} divergent",
        f"Generated suite: {len(entries)} samples",
        "",
    ]

    if not entries:
        lines.append("No samples matched the filter criteria.")
        return "\n".join(lines)

    classifications: dict[str, int] = {}
    for entry in entries:
        # The "classification" key may be present with a None (or empty)
        # value, so a plain .get() default is not enough; without this
        # fallback a None label is printed as "None" and breaks sorting
        # when mixed with string labels.
        label = entry.metadata.get("classification") or "unknown"
        classifications[label] = classifications.get(label, 0) + 1

    lines.append("By classification:")
    lines.extend(
        f" {label}: {count}" for label, count in sorted(classifications.items())
    )

    has_expected = sum(1 for e in entries if e.expected is not None)
    lines.append(f"\nWith expected output: {has_expected}/{len(entries)}")

    return "\n".join(lines)
Loading
Loading