bootstrap-basil/score_override.py at main · hunterooc/bootstrap-basil · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
"""
Programmatic Score Floor — Englishness Detection

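Two-phase scoring system for Basil's output:
  Phase 1 (this module):  Programmatic floor based on English word detection (0-2)
  Phase 2 (grader_agent): LLM grader scores task compliance (0-7)
  Final score = max(programmatic_floor, target_floor, llm_grader_score),
  where target_floor is set by apply_score_floor only when `targets` is
  passed and one of them appears in Basil's output: 6 normally, or 3 if
  the output is excessively repetitive (0 otherwise).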

Programmatic floor levels:
  0: No English words at all (pure gibberish)
  1: Exactly 1 unique English word
  2: 2+ unique English words (vocabulary diversity)

Topic-relevance scoring (3+) is left entirely to the LLM grader, which
is better calibrated to distinguish content-specific domain words from
generic function words that happen to appear in the session text.

Usage:
    from score_override import apply_score_floor

    grade = grade_response(...)  # LLM grader returns 0-7
    grade = apply_score_floor(grade, basil_output, session_text_before_basil)
    # grade["score"] is now max(programmatic_floor, grader_score)
"""

import re
from spellchecker import SpellChecker

# Minimum word length: 2 chars allows "no", "me", "am", "if", etc.
# Excludes single letters since pyspellchecker treats all 26 as valid.
# Used as the default `min_length` for extract_words() and
# compute_programmatic_score().
MIN_WORD_LENGTH = 2

# Module-level SpellChecker instance (dictionary loads once, ~200ms).
# Shared by get_english_words() so every lookup reuses the same instance.
_spell = SpellChecker()

# Role names that appear in every session.  They still count toward the
# Englishness floor (scores 1-2) whenever the spell checker recognizes
# them, but compute_programmatic_score() subtracts them from the
# audit-only "session_words_matched" list so they are never reported as
# session/domain-relevant vocabulary.
ROLE_NAMES = {"sophie", "basil", "tutor"}

def extract_words(text: str, min_length: int = MIN_WORD_LENGTH) -> list:
    """
    Tokenize ``text`` into lowercase, purely alphabetic words.

    A token is a maximal run of ASCII letters delimited by word boundaries,
    so runs glued to digits or underscores (e.g. "abc123") are not matched.
    Tokens shorter than ``min_length`` characters are dropped.

    Args:
        text: Raw text to scan.
        min_length: Smallest token length to keep.

    Returns:
        List of lowercased tokens in order of appearance; duplicates are
        kept so callers can do frequency analysis.
    """
    words = []
    for match in re.finditer(r'\b[a-zA-Z]+\b', text):
        token = match.group()
        # Length is checked on the raw token; lowercasing never changes
        # the length of an ASCII-letter run.
        if len(token) >= min_length:
            words.append(token.lower())
    return words


def get_english_words(tokens: list) -> set:
    """
    Filter ``tokens`` down to the ones the shared spell checker knows.

    Args:
        tokens: List of lowercased word strings (duplicates allowed).

    Returns:
        Set of distinct tokens that the module-level pyspellchecker
        instance reports as known words; an empty set when ``tokens``
        is empty.
    """
    if tokens:
        # De-duplicate first so each distinct word is looked up only once.
        return _spell.known(set(tokens))
    return set()


def is_excessively_repetitive(tokens: list) -> bool:
    """
    Decide whether a token sequence looks like looping output.

    The sequence is flagged when either rule fires:
      - Run rule: some word occurs 3+ times back-to-back
        (e.g., "cat cat cat").
      - Dominance rule: with at least 5 tokens, one word makes up more
        than 60% of them.

    Sequences of fewer than 3 tokens are never flagged.

    Args:
        tokens: Ordered list of word tokens (duplicates preserved).

    Returns:
        True if the output is considered excessively repetitive.
    """
    total = len(tokens)
    if total < 3:
        return False

    # Run rule: slide a window of three over the sequence.
    if any(a == b == c for a, b, c in zip(tokens, tokens[1:], tokens[2:])):
        return True

    # Dominance rule needs a minimum sample size — with fewer than 5
    # tokens, ordinary output like "dog cat dog" (2/3 = 67%) would trip it.
    if total < 5:
        return False

    from collections import Counter
    top_count = max(Counter(tokens).values())
    return top_count / total > 0.60


def compute_programmatic_score(basil_output: str, session_text: str = "",
                                min_length: int = MIN_WORD_LENGTH) -> dict:
    """
    Compute the programmatic Englishness floor score (0-2).

    The programmatic floor only detects English word presence and diversity.
    Topic-relevance scoring (3+) is left to the LLM grader, which is better
    calibrated to distinguish content-specific vocabulary from generic
    function words that happen to appear in the session text.

    Scoring rules, in order:
      - 0 if no token is recognized as an English word.
      - 1 if the output is excessively repetitive (see
        is_excessively_repetitive), regardless of vocabulary size.
      - 2 if there are 2+ unique English words, otherwise 1.

    Args:
        basil_output: Basil's generated response text. None is treated
            as empty text.
        session_text: All Tutor/Sophie text before Basil's turn (for audit logging).
        min_length: Minimum word length to consider.

    Returns:
        Dict with:
            score: int (0-2)
            english_words: sorted list of unique English words found
            session_words_matched: sorted list of English words shared with
                the session text, minus ROLE_NAMES (audit only; left empty
                when the score is 0 or the output is repetitive)
            reason: str describing why this score was assigned
            repetitive: bool, whether the output tokens were flagged as
                excessively repetitive (always present)
    """
    basil_tokens = extract_words(basil_output or "", min_length)
    basil_english = get_english_words(basil_tokens)
    unique_count = len(basil_english)

    # Evaluate repetition up front so the "repetitive" key is present and
    # accurate on every return path, including the no-English early exit.
    repetitive = is_excessively_repetitive(basil_tokens)

    result = {
        "score": 0,
        "english_words": sorted(basil_english),
        "session_words_matched": [],
        "reason": "",
        "repetitive": repetitive,
    }

    if unique_count == 0:
        result["reason"] = "no English words found"
        return result

    # Repetition check — cap at score 1 if output is looping
    if repetitive:
        result["score"] = 1
        result["reason"] = (
            f"{unique_count} unique English word(s), but excessively repetitive — "
            f"capped at 1"
        )
        return result

    # Check for session word overlap (logged for audit, but no longer
    # used to inflate the programmatic score to 3).  unique_count is
    # guaranteed to be >= 1 here, so only session_text needs checking.
    if session_text:
        session_tokens = extract_words(session_text, min_length)
        session_english = get_english_words(session_tokens)
        result["session_words_matched"] = sorted(
            (basil_english & session_english) - ROLE_NAMES
        )

    if unique_count >= 2:
        result["score"] = 2
        result["reason"] = f"{unique_count} unique English words (diversity)"
    else:
        result["score"] = 1
        result["reason"] = f"1 unique English word: {sorted(basil_english)}"

    return result


def apply_score_floor(grade: dict, basil_output: str,
                      session_text_before_basil: str = "",
                      targets: list = None,
                      verbose: bool = False) -> dict:
    """
    Apply programmatic floors to a grader result.

    Two floors are checked:
      1. Englishness floor (0-2): based on English word detection
      2. Target-word presence floor: if any target word from the task
         is present in Basil's output (as decided by
         task_contract.text_contains_token), the floor is 6 — or 3 when
         the output is excessively repetitive.

    Final score = max(grader_score, englishness_floor, target_floor).
    Modifies grade dict in-place and returns it.

    If basil_output is empty, grade is returned untouched and no audit
    fields are added.  A missing or None grader score is treated as 0 when
    comparing against the floors.

    Audit fields written to grade:
        programmatic_score, programmatic_english_words: always.
        programmatic_session_words: only when session overlap was found.
        target_floor_matched, target_floor: only when a target matched.
        floor_applied: True if the score was raised, else False.
        floor_reason: only when the score was raised.

    Args:
        grade: Grade dict from grader_agent (normally has a "score" key).
        basil_output: Basil's generated response text.
        session_text_before_basil: All Tutor/Sophie text before Basil's turn.
        targets: List of target words/phrases from task_spec (optional).
            Empty or None entries are skipped.
        verbose: Print floor details.

    Returns:
        The (possibly modified) grade dict with added audit fields.
    """
    if not basil_output:
        return grade

    prog = compute_programmatic_score(basil_output, session_text_before_basil)
    prog_score = prog["score"]

    # A grader that failed to produce a number may leave None under
    # "score"; comparing an int floor against None would raise TypeError.
    raw_score = grade.get("score", 0)
    grader_score = 0 if raw_score is None else raw_score

    # Record programmatic analysis for audit
    grade["programmatic_score"] = prog_score
    grade["programmatic_english_words"] = prog["english_words"]
    if prog["session_words_matched"]:
        grade["programmatic_session_words"] = prog["session_words_matched"]

    # Target-word presence floor: if any target from the task_spec is
    # found in Basil's output, guarantee at least score 6.
    # Exception: excessively repetitive output stays capped at 3.
    target_floor = 0
    target_matched = None
    if targets:
        # Imported here so the dependency is only needed when targets
        # are actually supplied.
        from task_contract import text_contains_token
        basil_lower = basil_output.lower()
        for target in targets:
            # Skip empty/None entries: they carry no word to look for and
            # must not be able to trigger the floor (or crash on .lower()).
            if not target:
                continue
            if text_contains_token(basil_lower, target.lower()):
                target_matched = target
                repetitive = is_excessively_repetitive(extract_words(basil_output))
                target_floor = 3 if repetitive else 6
                break

    if target_matched:
        grade["target_floor_matched"] = target_matched
        grade["target_floor"] = target_floor

    # Combined floor = max(englishness, target)
    combined_floor = max(prog_score, target_floor)

    # Apply floor: final = max(grader, combined_floor)
    if combined_floor > grader_score:
        grade["score"] = combined_floor
        grade["floor_applied"] = True
        # Inside this branch the winning floor already exceeds the grader
        # score, so it is enough to ask which of the two floors won.
        if target_floor > prog_score:
            grade["floor_reason"] = (
                f"target word '{target_matched}' present → floor {target_floor}"
            )
        else:
            grade["floor_reason"] = prog["reason"]

        if verbose:
            print(f"  [Floor] {grader_score}→{combined_floor}: {grade['floor_reason']}")
    else:
        grade["floor_applied"] = False

    return grade