Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions evalview/core/chaos.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@

import hashlib
from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Tuple
from typing import Any, Dict, List, Optional, Set, Tuple


# ── Mode constants ──────────────────────────────────────────────────────────
Expand Down Expand Up @@ -193,7 +193,7 @@ def build_scenario(
The "one disruption per step" rule keeps the simulator
deterministic and the scenario easy for humans to read.
"""
seen: set[int] = set()
seen: Set[int] = set()
unique: List[ChaosDisruption] = []
for d in disruptions:
if d.step_index in seen:
Expand Down Expand Up @@ -233,7 +233,7 @@ def random_scenario(
raise ValueError("max_steps must be positive")

disruptions: List[ChaosDisruption] = []
used_steps: set[int] = set()
used_steps: Set[int] = set()
for i in range(n_disruptions):
mode = modes[_seeded_choice(seed, "mode", i) % len(modes)]
# Pick a step that isn't taken yet. After many collisions we
Expand Down
6 changes: 3 additions & 3 deletions evalview/core/fleet.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@
import statistics
from dataclasses import dataclass
from pathlib import Path
from typing import Any, Dict, Iterable, List, Optional, Sequence, Tuple
from typing import Any, Dict, Iterable, List, Optional, Sequence, Set, Tuple


# ── Defaults ────────────────────────────────────────────────────────────────
Expand Down Expand Up @@ -216,7 +216,7 @@ def summarize_instance(name: str, entries: Sequence[Dict[str, Any]]) -> Instance
cost = 0.0
first_seen: Optional[str] = None
last_seen: Optional[str] = None
failing: set[str] = set()
failing: Set[str] = set()

for e in entries:
if "total_tests" not in e:
Expand Down Expand Up @@ -416,7 +416,7 @@ def discover_history_files(
found.extend(sorted(dp.glob("*.jsonl")))

# Dedup while preserving order.
seen: set[Path] = set()
seen: Set[Path] = set()
out: List[Path] = []
for p in found:
resolved = p.resolve()
Expand Down
32 changes: 9 additions & 23 deletions evalview/core/freshness.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,9 @@
import re
from dataclasses import dataclass
from datetime import datetime, timezone
from typing import Any, Dict, Iterable, List, Optional, Sequence, Tuple
from typing import Any, Dict, FrozenSet, Iterable, List, Optional, Sequence, Tuple

from evalview.core.text import STOPWORDS as _STOPWORDS


# ── Tunables ────────────────────────────────────────────────────────────────
Expand All @@ -49,26 +51,10 @@
_MAX_EXAMPLES_PER_CLUSTER = 5


# A short English stoplist. Keeping this tiny on purpose: Jaccard is already
# coarse and overly aggressive stopword filtering throws away signal. These
# are the words whose presence is least informative for query similarity.
_STOPWORDS = frozenset({
"a", "an", "the", "is", "are", "was", "were", "be", "been", "being",
"do", "does", "did", "doing", "have", "has", "had", "having",
"i", "you", "he", "she", "it", "we", "they", "me", "him", "her", "us", "them",
"my", "your", "his", "its", "our", "their", "mine", "yours", "ours",
"this", "that", "these", "those",
"and", "or", "but", "if", "then", "else", "of", "in", "on", "at", "to",
"for", "with", "by", "from", "as", "about", "into", "than",
"can", "could", "would", "should", "will", "shall", "may", "might", "must",
"not", "no", "so", "just", "also", "very", "really", "please",
})


# ── Tokenization & similarity ───────────────────────────────────────────────


def normalize_query(query: str) -> frozenset[str]:
def normalize_query(query: str) -> FrozenSet[str]:
"""Lower-case, strip punctuation, collapse numbers, drop stopwords, tokenize.

The returned set is used directly for Jaccard similarity. We deliberately
Expand All @@ -93,7 +79,7 @@ def normalize_query(query: str) -> frozenset[str]:
return frozenset(tokens)


def jaccard(a: frozenset[str], b: frozenset[str]) -> float:
def jaccard(a: FrozenSet[str], b: FrozenSet[str]) -> float:
"""Standard Jaccard set similarity in ``[0.0, 1.0]``.

Defined as ``|A ∩ B| / |A ∪ B|``. Returns 0.0 when both sets are empty
Expand Down Expand Up @@ -169,7 +155,7 @@ def compute_coverage(
Jaccard similarity. If the suite is empty, every production query is
classified as uncovered with ``similarity == 0.0``.
"""
suite_tokens: List[Tuple[str, frozenset[str]]] = [
suite_tokens: List[Tuple[str, FrozenSet[str]]] = [
(q, normalize_query(q)) for q in suite_queries if q
]

Expand Down Expand Up @@ -230,7 +216,7 @@ def examples(self, limit: int = _MAX_EXAMPLES_PER_CLUSTER) -> List[str]:

def _pick_representative(
members: Sequence[str],
token_cache: Dict[str, frozenset[str]],
token_cache: Dict[str, FrozenSet[str]],
) -> Tuple[str, float]:
"""Return ``(representative, avg_intra_similarity)`` for a cluster.

Expand Down Expand Up @@ -279,11 +265,11 @@ def cluster_queries(
volumes typical of an early-production agent (hundreds to low thousands),
this is plenty fast.
"""
token_cache: Dict[str, frozenset[str]] = {}
token_cache: Dict[str, FrozenSet[str]] = {}
# Using ``id(seed)`` would be unstable across runs; instead key by the
# seed string itself, with a counter as tiebreaker for duplicates.
cluster_members: List[List[str]] = []
cluster_seeds: List[frozenset[str]] = []
cluster_seeds: List[FrozenSet[str]] = []

for q in queries:
if not q:
Expand Down
23 changes: 5 additions & 18 deletions evalview/core/goal_drift.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,9 @@

import re
from dataclasses import dataclass, field
from typing import Callable, Iterable, List, Optional, Sequence, Tuple
from typing import Callable, FrozenSet, Iterable, List, Optional, Sequence, Tuple

from evalview.core.text import STOPWORDS as _STOPWORDS


# ── Tunables ────────────────────────────────────────────────────────────────
Expand All @@ -47,21 +49,6 @@
_MAX_TEXT_CHARS = 4096


# Mirror the small stoplist in evalview.core.freshness so the two modules
# behave consistently when a future refactor unifies them.
_STOPWORDS = frozenset({
"a", "an", "the", "is", "are", "was", "were", "be", "been", "being",
"do", "does", "did", "doing", "have", "has", "had", "having",
"i", "you", "he", "she", "it", "we", "they", "me", "him", "her", "us", "them",
"my", "your", "his", "its", "our", "their", "mine", "yours", "ours",
"this", "that", "these", "those",
"and", "or", "but", "if", "then", "else", "of", "in", "on", "at", "to",
"for", "with", "by", "from", "as", "about", "into", "than",
"can", "could", "would", "should", "will", "shall", "may", "might", "must",
"not", "no", "so", "just", "also", "very", "really", "please",
})


# ── Data shapes ─────────────────────────────────────────────────────────────


Expand Down Expand Up @@ -106,7 +93,7 @@ def severity(self) -> str:
# ── Tokenization (kept local to avoid coupling to freshness module) ─────────


def _tokens(text: str) -> frozenset[str]:
def _tokens(text: str) -> FrozenSet[str]:
"""Lower / strip / collapse digits / drop stopwords → token set.

Same digit normalization as the freshness module: order numbers and
Expand All @@ -124,7 +111,7 @@ def _tokens(text: str) -> frozenset[str]:
)


def _jaccard(a: frozenset[str], b: frozenset[str]) -> float:
def _jaccard(a: FrozenSet[str], b: FrozenSet[str]) -> float:
if not a or not b:
return 0.0
union = len(a | b)
Expand Down
21 changes: 5 additions & 16 deletions evalview/core/retrieval_lineage.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,23 +27,12 @@

import re
from dataclasses import dataclass, field
from typing import Callable, Dict, List, Optional, Sequence, Tuple
from typing import Callable, Dict, FrozenSet, List, Optional, Sequence, Tuple

from evalview.core.text import STOPWORDS as _STOPWORDS

# ── Tunables ────────────────────────────────────────────────────────────────

# Stoplist mirrors the freshness/goal_drift modules — keep them in sync.
_STOPWORDS = frozenset({
"a", "an", "the", "is", "are", "was", "were", "be", "been", "being",
"do", "does", "did", "doing", "have", "has", "had", "having",
"i", "you", "he", "she", "it", "we", "they", "me", "him", "her", "us", "them",
"my", "your", "his", "its", "our", "their", "mine", "yours", "ours",
"this", "that", "these", "those",
"and", "or", "but", "if", "then", "else", "of", "in", "on", "at", "to",
"for", "with", "by", "from", "as", "about", "into", "than",
"can", "could", "would", "should", "will", "shall", "may", "might", "must",
"not", "no", "so", "just", "also", "very", "really", "please",
})
# ── Tunables ────────────────────────────────────────────────────────────────

_MAX_TEXT_CHARS = 8192

Expand Down Expand Up @@ -139,7 +128,7 @@ class StaleMemoryFlag:
# ── Tokenization ────────────────────────────────────────────────────────────


def _tokens(text: str) -> frozenset[str]:
def _tokens(text: str) -> FrozenSet[str]:
if not text:
return frozenset()
truncated = text[:_MAX_TEXT_CHARS].lower()
Expand All @@ -151,7 +140,7 @@ def _tokens(text: str) -> frozenset[str]:
)


def _overlap(chunk_tokens: frozenset[str], output_tokens: frozenset[str]) -> float:
def _overlap(chunk_tokens: FrozenSet[str], output_tokens: FrozenSet[str]) -> float:
"""Fraction of chunk tokens that appear in the output.

This is *recall on the chunk*, not Jaccard. We want "did the output
Expand Down
23 changes: 23 additions & 0 deletions evalview/core/text.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
"""Tiny shared text helpers used by the Jaccard-style analyses.

Centralizing the stoplist here keeps ``freshness``, ``goal_drift``, and
``retrieval_lineage`` consistent. Kept intentionally small: Jaccard on a bag
of words is already coarse, and aggressive stopword filtering throws away
signal.
"""
from __future__ import annotations

from typing import FrozenSet


# Shared English stoplist. The words are grouped by rough grammatical role
# purely for readability; the resulting frozenset is unordered and the
# grouping has no effect on membership.
STOPWORDS: FrozenSet[str] = frozenset((
    # Articles and forms of "to be".
    "a", "an", "the",
    "is", "are", "was", "were", "be", "been", "being",
    # Forms of "to do" and "to have".
    "do", "does", "did", "doing",
    "have", "has", "had", "having",
    # Personal pronouns.
    "i", "you", "he", "she", "it", "we", "they",
    "me", "him", "her", "us", "them",
    # Possessives.
    "my", "your", "his", "its", "our", "their",
    "mine", "yours", "ours",
    # Demonstratives.
    "this", "that", "these", "those",
    # Conjunctions and conditionals.
    "and", "or", "but", "if", "then", "else",
    # Prepositions and comparatives.
    "of", "in", "on", "at", "to",
    "for", "with", "by", "from", "as", "about", "into", "than",
    # Modal verbs.
    "can", "could", "would", "should", "will",
    "shall", "may", "might", "must",
    # Negations, intensifiers, and politeness filler.
    "not", "no", "so", "just", "also", "very", "really", "please",
))
Loading