diff --git a/evalview/core/chaos.py b/evalview/core/chaos.py index 6d78b6a..8f50229 100644 --- a/evalview/core/chaos.py +++ b/evalview/core/chaos.py @@ -30,7 +30,7 @@ import hashlib from dataclasses import dataclass, field -from typing import Any, Dict, List, Optional, Tuple +from typing import Any, Dict, List, Optional, Set, Tuple # ── Mode constants ────────────────────────────────────────────────────────── @@ -193,7 +193,7 @@ def build_scenario( The "one disruption per step" rule keeps the simulator deterministic and the scenario easy for humans to read. """ - seen: set[int] = set() + seen: Set[int] = set() unique: List[ChaosDisruption] = [] for d in disruptions: if d.step_index in seen: @@ -233,7 +233,7 @@ def random_scenario( raise ValueError("max_steps must be positive") disruptions: List[ChaosDisruption] = [] - used_steps: set[int] = set() + used_steps: Set[int] = set() for i in range(n_disruptions): mode = modes[_seeded_choice(seed, "mode", i) % len(modes)] # Pick a step that isn't taken yet. After many collisions we diff --git a/evalview/core/fleet.py b/evalview/core/fleet.py index 2295b42..6243c5f 100644 --- a/evalview/core/fleet.py +++ b/evalview/core/fleet.py @@ -26,7 +26,7 @@ import statistics from dataclasses import dataclass from pathlib import Path -from typing import Any, Dict, Iterable, List, Optional, Sequence, Tuple +from typing import Any, Dict, Iterable, List, Optional, Sequence, Set, Tuple # ── Defaults ──────────────────────────────────────────────────────────────── @@ -216,7 +216,7 @@ def summarize_instance(name: str, entries: Sequence[Dict[str, Any]]) -> Instance cost = 0.0 first_seen: Optional[str] = None last_seen: Optional[str] = None - failing: set[str] = set() + failing: Set[str] = set() for e in entries: if "total_tests" not in e: @@ -416,7 +416,7 @@ def discover_history_files( found.extend(sorted(dp.glob("*.jsonl"))) # Dedup while preserving order. - seen: set[Path] = set() + seen: Set[Path] = set() out: List[Path] = [] for p in found: resolved = p.resolve() diff --git a/evalview/core/freshness.py b/evalview/core/freshness.py index 816ff7d..b25ce76 100644 --- a/evalview/core/freshness.py +++ b/evalview/core/freshness.py @@ -25,7 +25,9 @@ import re from dataclasses import dataclass from datetime import datetime, timezone -from typing import Any, Dict, Iterable, List, Optional, Sequence, Tuple +from typing import Any, Dict, FrozenSet, Iterable, List, Optional, Sequence, Tuple + +from evalview.core.text import STOPWORDS as _STOPWORDS # ── Tunables ──────────────────────────────────────────────────────────────── @@ -49,26 +51,10 @@ _MAX_EXAMPLES_PER_CLUSTER = 5 -# A short English stoplist. Keeping this tiny on purpose: Jaccard is already -# coarse and overly aggressive stopword filtering throws away signal. These -# are the words whose presence is least informative for query similarity. -_STOPWORDS = frozenset({ - "a", "an", "the", "is", "are", "was", "were", "be", "been", "being", - "do", "does", "did", "doing", "have", "has", "had", "having", - "i", "you", "he", "she", "it", "we", "they", "me", "him", "her", "us", "them", - "my", "your", "his", "its", "our", "their", "mine", "yours", "ours", - "this", "that", "these", "those", - "and", "or", "but", "if", "then", "else", "of", "in", "on", "at", "to", - "for", "with", "by", "from", "as", "about", "into", "than", - "can", "could", "would", "should", "will", "shall", "may", "might", "must", - "not", "no", "so", "just", "also", "very", "really", "please", -}) - - # ── Tokenization & similarity ─────────────────────────────────────────────── -def normalize_query(query: str) -> frozenset[str]: +def normalize_query(query: str) -> FrozenSet[str]: """Lower-case, strip punctuation, collapse numbers, drop stopwords, tokenize. The returned set is used directly for Jaccard similarity. We deliberately @@ -93,7 +79,7 @@ def normalize_query(query: str) -> frozenset[str]: return frozenset(tokens) -def jaccard(a: frozenset[str], b: frozenset[str]) -> float: +def jaccard(a: FrozenSet[str], b: FrozenSet[str]) -> float: """Standard Jaccard set similarity in ``[0.0, 1.0]``. Defined as ``|A ∩ B| / |A ∪ B|``. Returns 0.0 when both sets are empty @@ -169,7 +155,7 @@ def compute_coverage( Jaccard similarity. If the suite is empty, every production query is classified as uncovered with ``similarity == 0.0``. """ - suite_tokens: List[Tuple[str, frozenset[str]]] = [ + suite_tokens: List[Tuple[str, FrozenSet[str]]] = [ (q, normalize_query(q)) for q in suite_queries if q ] @@ -230,7 +216,7 @@ def examples(self, limit: int = _MAX_EXAMPLES_PER_CLUSTER) -> List[str]: def _pick_representative( members: Sequence[str], - token_cache: Dict[str, frozenset[str]], + token_cache: Dict[str, FrozenSet[str]], ) -> Tuple[str, float]: """Return ``(representative, avg_intra_similarity)`` for a cluster. @@ -279,11 +265,11 @@ def cluster_queries( volumes typical of an early-production agent (hundreds to low thousands), this is plenty fast. """ - token_cache: Dict[str, frozenset[str]] = {} + token_cache: Dict[str, FrozenSet[str]] = {} # Use ``id(seed)`` would be unstable across runs; instead key by the # seed string itself, with a counter as tiebreaker for duplicates. cluster_members: List[List[str]] = [] - cluster_seeds: List[frozenset[str]] = [] + cluster_seeds: List[FrozenSet[str]] = [] for q in queries: if not q: diff --git a/evalview/core/goal_drift.py b/evalview/core/goal_drift.py index 4324f52..9365d84 100644 --- a/evalview/core/goal_drift.py +++ b/evalview/core/goal_drift.py @@ -31,7 +31,9 @@ import re from dataclasses import dataclass, field -from typing import Callable, Iterable, List, Optional, Sequence, Tuple +from typing import Callable, FrozenSet, Iterable, List, Optional, Sequence, Tuple + +from evalview.core.text import STOPWORDS as _STOPWORDS # ── Tunables ──────────────────────────────────────────────────────────────── @@ -47,21 +49,6 @@ _MAX_TEXT_CHARS = 4096 -# Mirror the small stoplist in evalview.core.freshness so the two modules -# behave consistently when a future refactor unifies them. -_STOPWORDS = frozenset({ - "a", "an", "the", "is", "are", "was", "were", "be", "been", "being", - "do", "does", "did", "doing", "have", "has", "had", "having", - "i", "you", "he", "she", "it", "we", "they", "me", "him", "her", "us", "them", - "my", "your", "his", "its", "our", "their", "mine", "yours", "ours", - "this", "that", "these", "those", - "and", "or", "but", "if", "then", "else", "of", "in", "on", "at", "to", - "for", "with", "by", "from", "as", "about", "into", "than", - "can", "could", "would", "should", "will", "shall", "may", "might", "must", - "not", "no", "so", "just", "also", "very", "really", "please", -}) - - # ── Data shapes ───────────────────────────────────────────────────────────── @@ -106,7 +93,7 @@ def severity(self) -> str: # ── Tokenization (kept local to avoid coupling to freshness module) ───────── -def _tokens(text: str) -> frozenset[str]: +def _tokens(text: str) -> FrozenSet[str]: """Lower / strip / collapse digits / drop stopwords → token set. Same digit normalization as the freshness module: order numbers and @@ -124,7 +111,7 @@ def _tokens(text: str) -> frozenset[str]: ) -def _jaccard(a: frozenset[str], b: frozenset[str]) -> float: +def _jaccard(a: FrozenSet[str], b: FrozenSet[str]) -> float: if not a or not b: return 0.0 union = len(a | b) diff --git a/evalview/core/retrieval_lineage.py b/evalview/core/retrieval_lineage.py index e2392fb..5f4ecf2 100644 --- a/evalview/core/retrieval_lineage.py +++ b/evalview/core/retrieval_lineage.py @@ -27,23 +27,12 @@ import re from dataclasses import dataclass, field -from typing import Callable, Dict, List, Optional, Sequence, Tuple +from typing import Callable, Dict, FrozenSet, List, Optional, Sequence, Tuple +from evalview.core.text import STOPWORDS as _STOPWORDS -# ── Tunables ──────────────────────────────────────────────────────────────── -# Stoplist mirrors the freshness/goal_drift modules — keep them in sync. -_STOPWORDS = frozenset({ - "a", "an", "the", "is", "are", "was", "were", "be", "been", "being", - "do", "does", "did", "doing", "have", "has", "had", "having", - "i", "you", "he", "she", "it", "we", "they", "me", "him", "her", "us", "them", - "my", "your", "his", "its", "our", "their", "mine", "yours", "ours", - "this", "that", "these", "those", - "and", "or", "but", "if", "then", "else", "of", "in", "on", "at", "to", - "for", "with", "by", "from", "as", "about", "into", "than", - "can", "could", "would", "should", "will", "shall", "may", "might", "must", - "not", "no", "so", "just", "also", "very", "really", "please", -}) +# ── Tunables ──────────────────────────────────────────────────────────────── _MAX_TEXT_CHARS = 8192 @@ -139,7 +128,7 @@ class StaleMemoryFlag: # ── Tokenization ──────────────────────────────────────────────────────────── -def _tokens(text: str) -> frozenset[str]: +def _tokens(text: str) -> FrozenSet[str]: if not text: return frozenset() truncated = text[:_MAX_TEXT_CHARS].lower() @@ -151,7 +140,7 @@ def _tokens(text: str) -> frozenset[str]: ) -def _overlap(chunk_tokens: frozenset[str], output_tokens: frozenset[str]) -> float: +def _overlap(chunk_tokens: FrozenSet[str], output_tokens: FrozenSet[str]) -> float: """Fraction of chunk tokens that appear in the output. This is *recall on the chunk*, not Jaccard. We want "did the output diff --git a/evalview/core/text.py b/evalview/core/text.py new file mode 100644 index 0000000..29f1524 --- /dev/null +++ b/evalview/core/text.py @@ -0,0 +1,23 @@ +"""Tiny shared text helpers used by the Jaccard-style analyses. + +Centralizing the stoplist here keeps ``freshness``, ``goal_drift``, and +``retrieval_lineage`` consistent. Kept intentionally small: Jaccard on a bag +of words is already coarse, and aggressive stopword filtering throws away +signal. +""" +from __future__ import annotations + +from typing import FrozenSet + + +STOPWORDS: FrozenSet[str] = frozenset({ + "a", "an", "the", "is", "are", "was", "were", "be", "been", "being", + "do", "does", "did", "doing", "have", "has", "had", "having", + "i", "you", "he", "she", "it", "we", "they", "me", "him", "her", "us", "them", + "my", "your", "his", "its", "our", "their", "mine", "yours", "ours", + "this", "that", "these", "those", + "and", "or", "but", "if", "then", "else", "of", "in", "on", "at", "to", + "for", "with", "by", "from", "as", "about", "into", "than", + "can", "could", "would", "should", "will", "shall", "may", "might", "must", + "not", "no", "so", "just", "also", "very", "really", "please", +})