-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtest_model_comparison.py
More file actions
501 lines (408 loc) · 19.3 KB
/
test_model_comparison.py
File metadata and controls
501 lines (408 loc) · 19.3 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
#!/usr/bin/env python3
"""
Model Comparison Script
=======================
Runs one realistic API call for each of the 6 agent roles through each
candidate model, printing results side-by-side with latency and token counts.
Models tested:
- gpt-4o-mini (current — baseline)
- gpt-4.1-mini (newer, cheaper, faster)
- gpt-4.1-nano (cheapest / fastest)
- gpt-5-mini (GPT-5 class, built-in reasoning, $0.25/$2.00 per 1M)
- gpt-5-nano (GPT-5 class, fastest/cheapest, $0.05/$0.40 per 1M)
- o4-mini (reasoning model — tested on grader only)
Usage:
python test_model_comparison.py # run all roles
python test_model_comparison.py --role grader # run one role only
python test_model_comparison.py --models gpt-4.1-mini gpt-4.1-nano # subset of models
"""
import argparse
import json
import time
import textwrap
from openai import OpenAI
client = OpenAI()  # uses OPENAI_API_KEY from env
# ── Models to compare ────────────────────────────────────────────────────────
# Chat-class candidates exercised for every role. List order is also the
# default CLI order and the order rows appear in the summary table.
DEFAULT_MODELS = ["gpt-4o-mini", "gpt-4.1-mini", "gpt-4.1-nano", "gpt-5-mini", "gpt-5-nano"]
# o-series reasoning models are slower/costlier, so they are only appended to
# the grader role (see make_grader_test), where scoring accuracy matters most.
REASONING_MODELS = ["o4-mini"]  # only tested on grader
# ── Colour helpers ───────────────────────────────────────────────────────────
class C:
    """Namespace of ANSI escape sequences for styled terminal output."""

    BOLD = "\x1b[1m"
    DIM = "\x1b[2m"
    CYAN = "\x1b[36m"
    GREEN = "\x1b[32m"
    YELLOW = "\x1b[33m"
    RED = "\x1b[31m"
    RESET = "\x1b[0m"
def banner(text):
    """Print *text* inside a full-width (80-column) cyan banner."""
    bar = "=" * 80
    print(f"\n{C.BOLD}{C.CYAN}{bar}")
    print(f" {text}")
    print(f"{bar}{C.RESET}\n")
def sub_banner(text):
    """Print a small yellow section divider around *text*."""
    label = f"--- {text} ---"
    print(f"\n{C.BOLD}{C.YELLOW}{label}{C.RESET}")
# ── API call wrapper ─────────────────────────────────────────────────────────
def call_model(model, messages, temperature=0.7, max_tokens=512, json_mode=False):
    """Call a model and return (response_text, latency_ms, prompt_tokens, completion_tokens).

    On API failure, returns an "[ERROR] ..." string with zero token counts
    instead of raising, so a comparison run can continue past a bad model.
    For reasoning-capable models, the completion count is reported as the
    string "Nr+M" (N reasoning tokens + M output tokens).

    Model-family quirks handled here:
    - o-series and GPT-5 models take ``max_completion_tokens`` instead of
      ``max_tokens`` and burn hidden reasoning tokens, so they get a 4x budget.
    - o-series and GPT-5 models only support the default temperature.
    - o-series does not support ``response_format``; GPT-4/GPT-5 chat do.
    """
    is_o_series = model.startswith("o")
    is_gpt5 = model.startswith("gpt-5")
    needs_completion_tokens = is_o_series or is_gpt5

    kwargs = dict(model=model, messages=messages)
    if needs_completion_tokens:
        # Reasoning tokens count against this budget — give extra headroom.
        kwargs["max_completion_tokens"] = max(max_tokens * 4, 2048)
    else:
        kwargs["max_tokens"] = max_tokens
        kwargs["temperature"] = temperature
    if json_mode and not is_o_series:
        kwargs["response_format"] = {"type": "json_object"}

    t0 = time.perf_counter()
    try:
        response = client.chat.completions.create(**kwargs)
    except Exception as e:
        # Best-effort: report the failure as the "response" so the run continues.
        elapsed = (time.perf_counter() - t0) * 1000
        return f"[ERROR] {e}", elapsed, 0, 0
    elapsed = (time.perf_counter() - t0) * 1000

    text = (response.choices[0].message.content or "").strip()
    usage = response.usage
    comp_tokens = usage.completion_tokens
    # For reasoning models, split the count so hidden reasoning tokens are
    # visible next to the actual output tokens ("Nr+M").
    if needs_completion_tokens:
        details = getattr(usage, "completion_tokens_details", None)
        reasoning = getattr(details, "reasoning_tokens", 0) if details else 0
        if reasoning:
            comp_tokens = f"{reasoning}r+{comp_tokens - reasoning}"
    return text, elapsed, usage.prompt_tokens, comp_tokens
def print_result(model, text, latency_ms, prompt_tok, comp_tok):
    """Pretty-print one model's result: a header with latency and token
    counts, then the response body wrapped to 76 columns."""
    stats = f"({latency_ms:,.0f} ms | {prompt_tok}+{comp_tok} tok)"
    print(f" {C.BOLD}{model:<20}{C.RESET} {C.DIM}{stats}{C.RESET}")
    if text:
        # Wrap each original line independently so paragraph breaks survive.
        for raw_line in text.split("\n"):
            print(textwrap.fill(raw_line, width=76, initial_indent=" ", subsequent_indent=" "))
    else:
        print(f" {C.RED}[empty response]{C.RESET}")
    print()
# ── Test cases ───────────────────────────────────────────────────────────────
# Each returns (role_name, messages, temperature, max_tokens, json_mode, extra_models)
def make_tutor_test():
    """TUTOR: Generate a teaching paragraph + microtask (Phase B.1)."""
    system = textwrap.dedent("""\
        You are Tutor, a patient and encouraging teacher helping a baby language model
        named Basil learn to speak. Basil is like a very young child who is just
        learning language.
        ## Basil's Current Assessment
        Age Band: 1 - first words (closed sets, simple labels)
        Max Output Tokens: 40
        ## Session Info
        Subject: Ocean Life
        Lesson: Sea Creatures
        ## Teaching Style
        Provide a teaching paragraph (3-6 sentences) about the lesson topic, then
        end with exactly ONE task line: TASK: <simple instruction>
        ## Recent Conversation
        Tutor: Let's learn about sea creatures today, Basil!
        Sophie: Ooh, I love the ocean! Did you know octopuses have three hearts?
        Basil: bla mmm fish da
        Tutor: That's right, Basil — fish! Fish live in the ocean. They have fins to swim.
        TASK: Say 'fish'
        Basil: fi fish
        """)
    user_prompt = "Continue as Tutor. Teach a coherent paragraph about sea creatures, then give Basil one clear TASK."
    messages = [
        {"role": "system", "content": system},
        {"role": "user", "content": user_prompt},
    ]
    # (role_name, messages, temperature, max_tokens, json_mode, extra_models)
    return ("tutor", messages, 0.9, 300, False, [])
def make_sophie_test():
    """SOPHIE: React to Tutor's teaching (Phase B.2)."""
    system = textwrap.dedent("""\
        You are Sophie, a kind and articulate 12-year-old who is Basil's older sibling.
        ## Context
        Subject: Ocean Life
        Lesson: Sea Creatures
        Basil's age_band: 1
        ## Tutor Just Said
        Great try saying 'fish', Basil! Let me tell you about another amazing sea
        creature — the sea turtle. Sea turtles are some of the oldest animals on Earth.
        They've been swimming in the oceans for over 100 million years! Baby sea turtles
        hatch from eggs on sandy beaches and then crawl all the way to the water.
        Some sea turtles can hold their breath for up to 5 hours while they sleep underwater.
        ## Your Task
        React naturally to what Tutor just taught. Write 1-2 short sentences:
        - Acknowledge or show interest in what Tutor said
        - You can ADD an additional fact or BUILD on what Tutor said
        - Keep it warm and conversational (12-year-old voice)
        - Do NOT mention Basil or tasks yet
        - Keep under 40 words total
        """)
    user_prompt = "React to the teaching as Sophie."
    messages = [
        {"role": "system", "content": system},
        {"role": "user", "content": user_prompt},
    ]
    # (role_name, messages, temperature, max_tokens, json_mode, extra_models)
    return ("sophie", messages, 0.8, 100, False, [])
def make_task_agent_test():
    """TASK AGENT: Generate task + rubric (JSON)."""
    # NOTE: this long prompt is deliberately sent as the *user* message, with a
    # short system message pinning the JSON-only behavior.
    system = textwrap.dedent("""\
        You are a Task Agent that generates evaluation tasks for Basil, a baby language
        model learning to speak.
        ## Basil's Assessment
        Age Band: 1 - first words (closed sets, simple labels)
        Max Output Tokens: 40
        ## Current Session Context
        Subject: Ocean Life
        Lesson: Sea Creatures
        Allowed Topic Keywords: fish, whale, shark, crab, turtle, shell, wave, swim, blue, sea
        ## Recent Targets (AVOID repeating)
        fish, whale
        ## Previous Askers
        tutor, tutor, sophie
        ## Task Generation Rules
        Age Band 1: English fragments, no compliance. Focus on control, vocab attempts.
        Examples: "Say 'cat'", "Say yes or no"
        ## Ground Tasks in the Lesson (CRITICAL)
        The target word MUST come from the Allowed Topic Keywords list.
        ## OUTPUT FORMAT (JSON ONLY)
        Return valid JSON:
        {
        "task_text": "The question or prompt for Basil (short, clear)",
        "task_category": "control|vocab|relevance|memory|conversation",
        "stage_hint": 0-4,
        "asker": "tutor|sophie",
        "targets": ["word1"],
        "choices": [],
        "rubric": {
        "0": "...", "1": "...", "2": "...", "3": "...", "4": "...", "5": "...", "6": "...", "7": "..."
        },
        "grader_instructions": "Concise guidance",
        "max_tokens": 40
        }
        """)
    messages = [
        {"role": "system", "content": "You are a task generation agent. Output valid JSON only."},
        {"role": "user", "content": system},
    ]
    # (role_name, messages, temperature, max_tokens, json_mode, extra_models)
    return ("task_agent", messages, 0.7, 400, True, [])
def make_grader_test():
    """GRADER: Score Basil's response (JSON). Also tests o4-mini."""
    # As with the task agent, the detailed prompt rides in the user message.
    system = textwrap.dedent("""\
        You are a Grader Agent evaluating responses from Basil, a baby language model
        learning to speak.
        ## Important Guidelines
        - Follow the rubric exactly - no hidden requirements
        - Be generous with partial credit for early-stage learners
        - Quote specific evidence from Basil's output
        - Remember: Basil is learning from scratch, so ANY recognizable word is progress
        ## Task Given to Basil
        Say 'turtle'
        ## Rubric
        0: No output or completely unintelligible noise
        1: Any recognizable English word (even if wrong)
        2: An animal-related or ocean-related word
        3: A word starting with 't' or rhyming with 'turtle'
        4: A close attempt like 'turt', 'turle', 'turtl'
        5: Clearly says 'turtle'
        ## Grader Instructions
        Look for any approximation of the target word 'turtle'. Award partial credit
        for ocean-related words or words starting with 't'. Be generous with partial
        phonetic matches since Basil is at age_band 1.
        ## Context (recent conversation)
        Tutor: Sea turtles are amazing creatures. They can swim thousands of miles!
        Sophie: Wow, I want to see one someday!
        Sophie: Hey Basil, say 'turtle'!
        ## Basil's Response
        tut tur mmm turtle da
        ## Output Format (JSON)
        {"score": 0-7, "justification": "...", "evidence": ["..."], "notes": "..."}
        Respond with ONLY the JSON object.
        """)
    messages = [
        {"role": "system", "content": "Output valid JSON only. No explanation."},
        {"role": "user", "content": system},
    ]
    # Grader also tests o4-mini since accuracy matters most here
    return ("grader", messages, 0.3, 300, True, REASONING_MODELS)
def make_summarizer_test():
    """SUMMARIZER: Write brief teacher notes for a session."""
    system = textwrap.dedent("""\
        You are writing brief teacher notes for a language model training session.
        Write 1-3 SHORT sentences in a factual, observational style.
        ## RULES
        - NO anthropomorphic evaluations (no "effort", "willingness", "creativity")
        - NO corporate prose ("demonstrated", "evidenced", "highlighted the need for")
        - YES concrete observations: what tasks were given, what words appeared, what to try next
        - Keep it under 60 words
        ## SESSION DATA
        Subject: Ocean Life
        Lesson: Sea Creatures
        Task types: vocab (8), control (4)
        Graded turns: 12
        Average score: 1.8/5
        Strategy trend: simplify → simplify → maintain
        Target words asked: fish, whale, turtle, crab, shark, shell, wave, swim, blue, sea, fin, splash
        Words Basil produced (hits): fish (2x), turtle (1x)
        ## EXAMPLE OUTPUT (good)
        "Practiced 'red' and 'blue' vocab. Basil produced 'red' correctly once. Scores low (0.8 avg). Try simpler yes/no tasks next."
        Write teacher notes now:
        """)
    messages = [
        {"role": "system", "content": "You write brief factual teacher notes. No corporate prose. No anthropomorphism."},
        {"role": "user", "content": system},
    ]
    # (role_name, messages, temperature, max_tokens, json_mode, extra_models)
    return ("summarizer", messages, 0.5, 150, False, [])
def make_naturalizer_test():
    """TASK NATURALIZER: Rewrite a raw task into natural dialogue."""
    system = textwrap.dedent("""\
        You are a task naturalizer that rewrites raw task instructions into natural,
        spoken dialogue appropriate for a child-friendly tutoring session.
        ## Input
        - Raw task: Say 'crab'
        - Speaker: sophie
        - Task category: vocab
        - Age band: 1 (first words)
        - Subject: Ocean Life
        - Lesson: Sea Creatures
        - Target words: crab
        ## Rules
        1. Preserve ALL target words exactly
        2. Keep it EXTREMELY short - max 15-20 words
        3. Address Basil by name
        4. Match Sophie's voice: casual, playful, sibling-like
        5. NEVER add new target words
        6. End with ? or !
        ## Examples
        Raw: "Say 'fire'" → Sophie: "Hey Basil, say 'fire'!"
        Raw: "Choose one: yes or no." → Sophie: "Basil, yes or no?"
        Return ONLY the rewritten line. No explanation, no quotes, no prefix.
        """)
    messages = [
        {"role": "system", "content": "You rewrite task instructions into natural dialogue. Output ONLY the rewritten line, nothing else."},
        {"role": "user", "content": system},
    ]
    # (role_name, messages, temperature, max_tokens, json_mode, extra_models)
    return ("naturalizer", messages, 0.7, 60, False, [])
def make_subject_generator_test():
    """SUBJECT GENERATOR: Generate age-appropriate subject list (JSON)."""
    system = textwrap.dedent("""\
        You are a curriculum specialist generating school subjects for a specific
        developmental level.
        ## Target Learner Age
        Toddler / pre-preschool (age band 1: first words)
        ## Learner's Current Capabilities
        Can occasionally produce a recognizable English word. Mostly babble with
        emerging proto-words. Responds to simple prompts like "Say 'dog'" with
        partial phonetic matches.
        ## Subjects to AVOID (recently used)
        Ocean Life, Animals, Colors, Shapes, Numbers, Weather, Food, Family, Music,
        Farm Life, Vehicles, Dinosaurs
        ## Rules
        1. Age-appropriate but think BROADLY — the subject can be ambitious if
        tasks are simplified separately
        2. School subjects only (1-3 words, NOT lesson topics)
        3. No repeats or synonyms of the AVOID list
        4. Balanced mix across many areas (science, arts, social studies, etc.)
        ## Output Format (JSON)
        {"age_desc": "Toddler / pre-preschool", "subjects": [{"name": "Subject 1"}, ...]}
        Generate 20 subjects. Output JSON only, no explanation.
        """)
    user_prompt = "Generate the subject list now."
    messages = [
        {"role": "system", "content": system},
        {"role": "user", "content": user_prompt},
    ]
    # (role_name, messages, temperature, max_tokens, json_mode, extra_models)
    return ("subject_generator", messages, 0.9, 600, True, [])
# ── All test cases ───────────────────────────────────────────────────────────
# Registry: role name -> zero-arg factory returning
# (role_name, messages, temperature, max_tokens, json_mode, extra_models).
# Dict insertion order defines the default run order for the CLI.
ALL_TESTS = {
    "tutor": make_tutor_test,
    "sophie": make_sophie_test,
    "task_agent": make_task_agent_test,
    "grader": make_grader_test,
    "summarizer": make_summarizer_test,
    "naturalizer": make_naturalizer_test,
    "subject_generator": make_subject_generator_test,
}
# Human-readable one-liners printed in each role's banner (keys must mirror
# ALL_TESTS).
ROLE_DESCRIPTIONS = {
    "tutor": "Tutor teaching paragraph + microtask (Phase B.1)",
    "sophie": "Sophie reacting to teaching (Phase B.2)",
    "task_agent": "Task Agent generating task + rubric (JSON)",
    "grader": "Grader scoring Basil's response (JSON)",
    "summarizer": "Summarizer writing teacher notes",
    "naturalizer": "Naturalizer rewriting task dialogue",
    "subject_generator": "Subject Generator producing subjects (JSON)",
}
# ── Main ─────────────────────────────────────────────────────────────────────
def run_comparison(roles, models):
    """Run every requested role through every model, then print a summary table.

    Parameters
    ----------
    roles : list[str]
        Keys into ALL_TESTS, run in the given order.
    models : list[str]
        Model names; a role's test may append extra models (e.g. the grader
        adds the reasoning models) without duplicating ones already requested.
    """
    total_calls = 0
    results = {}  # {role: [(model, text, ms, ptok, ctok), ...]}
    for role in roles:
        role_name, messages, temperature, max_tokens, json_mode, extra_models = ALL_TESTS[role]()
        banner(f"{role.upper()}: {ROLE_DESCRIPTIONS[role]}")
        # Preserve order; skip extras the caller already asked for.
        role_models = models + [m for m in extra_models if m not in models]
        results[role] = []
        for model in role_models:
            sub_banner(model)
            text, ms, ptok, ctok = call_model(
                model, messages, temperature, max_tokens, json_mode
            )
            print_result(model, text, ms, ptok, ctok)
            results[role].append((model, text, ms, ptok, ctok))
            total_calls += 1
    # ── Summary table ────────────────────────────────────────────────────
    banner("SUMMARY")
    header = f" {'Role':<20} {'Model':<20} {'Latency':>10} {'Prompt':>8} {'Compl':>8} {'JSON OK':>8}"
    print(header)
    print(f" {'-'*74}")
    json_roles = {"task_agent", "grader", "subject_generator"}
    for role in roles:
        for i, (model, text, ms, ptok, ctok) in enumerate(results[role]):
            json_ok = ""
            # Check JSON validity for JSON roles. `text` is always a str
            # (call_model returns "[ERROR] ..." on failure), so the only
            # parse failure is JSONDecodeError — the original's
            # (JSONDecodeError, Exception) tuple was redundant.
            if role in json_roles:
                try:
                    json.loads(text)
                    json_ok = f"{C.GREEN} yes{C.RESET}"
                except json.JSONDecodeError:
                    json_ok = f"{C.RED} NO{C.RESET}"
            role_label = role if i == 0 else ""  # only label the first row of a role
            print(f" {role_label:<20} {model:<20} {ms:>8,.0f}ms {ptok:>8} {ctok:>8} {json_ok}")
    print()
    print(f" {C.DIM}Total API calls: {total_calls}{C.RESET}\n")
def main():
    """CLI entry point: parse arguments, echo the configuration, run comparison."""
    parser = argparse.ArgumentParser(
        description="Compare OpenAI models across Basil agent roles"
    )
    parser.add_argument(
        "--role",
        "-r",
        choices=list(ALL_TESTS.keys()),
        nargs="+",
        default=None,
        help="Run specific role(s) only. Default: all roles.",
    )
    parser.add_argument(
        "--models",
        "-m",
        nargs="+",
        default=None,
        help=f"Models to test. Default: {', '.join(DEFAULT_MODELS)}",
    )
    args = parser.parse_args()

    # nargs="+" guarantees a non-empty list when the flag is given, so a
    # None check is equivalent to the truthiness test.
    roles = list(ALL_TESTS.keys()) if args.role is None else args.role
    models = DEFAULT_MODELS if args.models is None else args.models

    print(f"\n{C.BOLD}Model Comparison for Bootstrap Basil{C.RESET}")
    print(f"{C.DIM}Models: {', '.join(models)}{C.RESET}")
    print(f"{C.DIM}Roles: {', '.join(roles)}{C.RESET}")
    if "grader" in roles and not args.models:
        print(f"{C.DIM}Grader also tests: {', '.join(REASONING_MODELS)}{C.RESET}")

    run_comparison(roles, models)
# Guarded entry point: importing this module performs no API calls.
if __name__ == "__main__":
    main()