bootstrap-basil/auto_session.py at main · hunterooc/bootstrap-basil · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
# auto_session.py
# Runs automated training sessions with Task Agent and Grader Agent.
# Includes curriculum rotation: subject selection and lesson picking.

import os
import json
import re
import argparse
from datetime import datetime
from typing import Optional
import torch
# Model loading delegated to model_cache.py (per-process cache)
from openai import OpenAI

from config import (
    LOG_DIR,
    PROMPT_TUTOR_KICKOFF,
    PROMPT_TUTOR_PRIMER,
    PROMPT_SOPHIE_LESSON_SELECT,
    PROMPT_SOPHIE_POST_GRADE,
    PROMPT_SOPHIE_REACT_TEACHING,
    PROMPT_TUTOR_WRAPUP,
    PROMPT_SOPHIE_WRAPUP,
    PROMPT_TUTOR_PHASE_B,
    PROMPT_TUTOR_PHASE_F,
    get_age_band_natural_description,
    get_length_guidance,
    get_max_tokens_ceiling,
    get_task_guidance_for_age_band,
    PROMPT_TASK_VALIDATOR,
    PROMPT_TASK_NATURALIZER,
    TASK_NATURALIZER_CACHE_FILE,
    TASK_NATURALIZER_MODEL,
    TUTOR_MODEL,
    SOPHIE_MODEL,
    TASK_AGENT_MODEL,
    SESSION_MAX_TURNS,
    GRADE_EVERY_N_TURNS,
    BASIL_MAX_TOKENS,
    CONTEXT_WINDOW_LINES,
    SCORE_FLOOR,
    SCORE_FLOOR_WINDOW,
    score_to_weight,
    get_basil_model_name,
    DEBUG_SESSION,
    USE_LLM_TUTOR_FOR_AGE_BAND_0,
    # Phase 4 lifecycle settings
    MIN_GRADED_TURNS_PER_SESSION,
    MAX_GRADED_TURNS_PER_SESSION,
    EARLY_STOP_WINDOW_TURNS,
    EARLY_STOP_MIN_AVG_SCORE,
    EARLY_STOP_MAX_COMPLIANCE,
    ENABLE_GRACEFUL_WRAPUP,
    WRAPUP_TURNS,
    COMPLIANCE_SCORE_THRESHOLD,
    # Contract validation
    STRICT_TASK_CONTRACT,
    ALLOW_DEPRECATED_TOPIC_CHECK,
    # Logging
    LOG_TEACHING_PREVIEW_CHARS,
    # Training threshold
    get_train_every_graded_turns,
    # Popquiz
    POPQUIZ_ENABLED,
    PROMPT_TUTOR_QUIZ_SOPHIE,
    get_basil_generation_settings,
)
from task_contract import (
    normalize_token,
    normalize_targets,
    text_contains_token,
    validate_task_spec_contract,
    validate_naturalized_contains_targets,
)
from task_agent import generate_task, generate_task_candidates, select_best_task, _enforce_target_invariants
from grader_agent import grade_response
from score_override import apply_score_floor
from assessment_agent import assess_developmental_age_band
from memory_manager import (
    update_metrics_after_turn,
    update_metrics_after_session,
    generate_session_summary,
    load_metrics,
    load_basil_assessment,
    update_basil_assessment,
    get_assessment_for_prompt,
)
from curriculum_manager import (
    record_subject,
    record_lesson,
    get_blacklists,
    format_blacklist_for_prompt,
    normalize_subject,
    load_used_lessons,
    add_used_lesson,
    check_lesson_overlap,
)
from dedup_utils import fuzzy_match as dedup_fuzzy_match
from file_lock_utils import get_lock
from subject_generator import generate_subject_candidates
from teaching_angle_generator import pick_angle
from subject_topic_generator import pick_topic
from metrics_manager import (
    create_session_metrics,
    load_rolling_metrics,
    save_session_metrics,
    update_rolling_metrics,
)

from llm_client import create_smart_client
client = create_smart_client()

# Stopwords for keyword extraction (excluded from topic keywords)
STOPWORDS = {
    "the", "a", "an", "is", "are", "was", "were", "be", "been", "being",
    "have", "has", "had", "do", "does", "did", "will", "would", "could",
    "should", "may", "might", "must", "can", "about", "above", "after",
    "again", "all", "and", "any", "because", "before", "being", "below",
    "between", "both", "but", "by", "could", "during", "each", "few",
    "for", "from", "further", "here", "how", "into", "its", "just",
    "more", "most", "not", "now", "off", "only", "other", "our", "out",
    "over", "own", "same", "some", "such", "than", "that", "their",
    "them", "then", "there", "these", "they", "this", "those", "through",
    "too", "under", "until", "very", "what", "when", "where", "which",
    "while", "who", "why", "with", "you", "your", "today", "things",
}

SOPHIE_SCORE_REACTIONS = {
    0: ("Basil produced no English at all (0/7).",
        "Be gentle and warm. Don't reference what Basil said. Just encourage trying again."),
    1: ("Basil said an English word, but it had nothing to do with the lesson (1/7).",
        "Acknowledge the effort kindly. Note that Basil is finding words, even if they're not on topic yet."),
    2: ("Basil used several English words, but none related to the lesson topic (2/7).",
        "Supportive and patient. Basil is speaking more but hasn't connected to the lesson yet."),
    3: ("Basil used a word from the lesson topic -- real progress! (3/7).",
        "Noticeably pleased! Basil is starting to connect words to what's being taught. Show you noticed."),
    4: ("Basil used words directly related to the question -- getting close! (4/7).",
        "Excited and encouraging. Basil is clearly understanding the lesson. Cheer them on."),
    5: ("Basil tried to answer in the right format but didn't quite get the target word (5/7).",
        "Genuinely impressed! Basil almost got it. Express that you can tell Basil is so close."),
    6: ("Basil said the target word! It was buried in extra words, but it was there (6/7).",
        "Thrilled! Celebrate that Basil found the word. Be proud and specific about what Basil got right."),
    7: ("Basil nailed it -- clean, correct response! (7/7).",
        "Over the moon! This is a big deal. Celebrate genuinely with real excitement and pride."),
}

# Prompt cache for episode-specific prompts
_prompt_cache = {}

def _load_episode_prompt(file_path: str, prompt_name: str) -> str:
    """Load and cache an episode prompt file."""
    if file_path not in _prompt_cache:
        try:
            with open(file_path, "r") as f:
                _prompt_cache[file_path] = f.read()
            if DEBUG_SESSION:
                print(f"[Prompt] Loaded {prompt_name}: {file_path}")
        except FileNotFoundError:
            print(f"[Prompt] Warning: {file_path} not found, using fallback")
            _prompt_cache[file_path] = None
    return _prompt_cache.get(file_path)


# =============================================================================
# TASK NATURALIZER CACHE
# =============================================================================
# In-memory cache for task naturalization (persisted at session end)
_task_naturalizer_cache = {}
_task_naturalizer_cache_dirty = False


def _load_task_naturalizer_cache() -> dict:
    """Load task naturalizer cache from disk."""
    global _task_naturalizer_cache
    if _task_naturalizer_cache:
        return _task_naturalizer_cache
    try:
        if os.path.exists(TASK_NATURALIZER_CACHE_FILE):
            with open(TASK_NATURALIZER_CACHE_FILE, "r") as f:
                _task_naturalizer_cache = json.load(f)
            if DEBUG_SESSION:
                print(f"[TaskNaturalizer] Loaded cache with {len(_task_naturalizer_cache)} entries")
    except (json.JSONDecodeError, IOError) as e:
        print(f"[TaskNaturalizer] Cache load error: {e}")
        _task_naturalizer_cache = {}
    return _task_naturalizer_cache


def _save_task_naturalizer_cache():
    """Save task naturalizer cache to disk (called at session end). Process-safe via file lock."""
    global _task_naturalizer_cache_dirty
    if not _task_naturalizer_cache_dirty:
        return
    try:
        with get_lock(TASK_NATURALIZER_CACHE_FILE):
            # Re-read from disk under lock to merge any entries added by other workers
            disk_cache = {}
            if os.path.exists(TASK_NATURALIZER_CACHE_FILE):
                try:
                    with open(TASK_NATURALIZER_CACHE_FILE, "r") as f:
                        disk_cache = json.load(f)
                except (json.JSONDecodeError, IOError):
                    pass
            # Merge: our in-memory entries take precedence
            disk_cache.update(_task_naturalizer_cache)
            os.makedirs(os.path.dirname(TASK_NATURALIZER_CACHE_FILE), exist_ok=True)
            with open(TASK_NATURALIZER_CACHE_FILE, "w") as f:
                json.dump(disk_cache, f, indent=2)
        _task_naturalizer_cache_dirty = False
        if DEBUG_SESSION:
            print(f"[TaskNaturalizer] Saved cache with {len(_task_naturalizer_cache)} entries")
    except IOError as e:
        print(f"[TaskNaturalizer] Cache save error: {e}")


def _get_naturalizer_cache_key(task_text: str, task_category: str, age_band: int, speaker: str) -> str:
    """Generate a cache key for task naturalization."""
    import hashlib
    key_str = f"{age_band}|{task_category}|{speaker}|{task_text.strip().lower()}"
    return hashlib.md5(key_str.encode()).hexdigest()[:16]


def parse_lesson_from_response(response: str) -> Optional[str]:
    """
    Parse LESSON_OF_THE_DAY: <lesson title> from Sophie's response.

    Returns the lesson title or None if not found.
    """
    # Look for the marker
    pattern = r'LESSON_OF_THE_DAY:\s*(.+?)(?:\n|$)'
    match = re.search(pattern, response, re.IGNORECASE)

    if match:
        lesson = match.group(1).strip()
        # Clean up any trailing punctuation or formatting
        lesson = lesson.rstrip('.,;:')
        return lesson

    return None


class AutoSession:
    """Runs an automated Basil training session with curriculum support."""

    def __init__(
        self,
        session_id: Optional[str] = None,
        include_sophie: bool = True,
        content_snippet: str = "",
        subject: Optional[str] = None,  # Override subject selection
        verbose: bool = True,
        training_phase: str = "normal",  # "normal", "pretrain_eval", "posttrain_eval"
        batch_graded_path: Optional[str] = None,   # Batch-level graded JSONL (append)
        batch_sessions_path: Optional[str] = None,  # Batch-level debug log (append)
        batch_meta_path: Optional[str] = None,      # Batch-level metrics+summaries (append)
        model_path: Optional[str] = None,           # Override model path (experiment support)
        lora_strength: Optional[float] = None,      # Override LoRA strength (experiment support)
        skip_assessment: bool = False,               # Skip assessment/metrics updates (experiment eval)
    ):
        self.session_id = session_id or datetime.now().strftime("%Y%m%d_%H%M%S")
        # Store experiment overrides for _load_basil_model
        self._model_path_override = model_path
        self._lora_strength_override = lora_strength
        self.skip_assessment = skip_assessment
        self.include_sophie = include_sophie
        self.verbose = verbose
        self.training_phase = training_phase

        # Batch file paths (None = standalone mode, creates per-session files)
        self.batch_graded_path = batch_graded_path
        self.batch_sessions_path = batch_sessions_path
        self.batch_meta_path = batch_meta_path

        # Session state
        self.transcript = []  # List of (speaker, text) tuples
        self.graded_turns = []  # List of graded turn data
        self.turn_number = 0
        self._quiz_words_used = []  # Track quiz answer words to avoid repetition
        self.graded_turn_count = 0  # Phase 4: explicit graded turn counter
        self.recent_scores = []  # For rolling average (used for early stopping)
        self.total_basil_tokens = 0  # Track token usage
        self.episode_transcript_start = 0  # Index into self.transcript where current episode began
        self.recent_tasks = []  # List of (task_category, task_text) tuples for anti-repetition
        self.recent_sophie_questions = []  # List of normalized question topics for novelty
        self.recent_teaching_facts = []  # List of recent key facts/details used in teaching (for variety)
        self.previous_askers = []  # Track who asked recent tasks (for variety)
        self.recent_targets = []  # List of recent target words used in tasks (for variety)

        # Phase 4: stop state
        self.early_stopped = False
        self.stop_reason = "completed"

        # Curriculum state
        self.subject_of_the_day = subject  # Will be set in prelude if None
        self.lesson_of_the_day = None
        self.topic = None  # Set during prelude (Subject -> Topic -> Lesson hierarchy)
        self.teaching_angle = None  # Set during prelude (age_band >= 4 only)
        self.blacklists = get_blacklists()

        # Load Basil assessment
        self.assessment = get_assessment_for_prompt()
        self.age_band_start = self.assessment.get("age_band", 0)  # Track starting age_band
        basil_gen_settings = get_basil_generation_settings(self.age_band_start)
        self.basil_max_tokens = basil_gen_settings["max_tokens"]
        self.basil_temperature = basil_gen_settings["temperature"]

        # Load prompts
        self.tutor_kickoff_template = self._load_file(PROMPT_TUTOR_KICKOFF)
        self.tutor_primer_template = self._load_file(PROMPT_TUTOR_PRIMER)
        # Load separate Sophie prompts for lesson selection, reaction to teaching, and post-grade
        self.sophie_lesson_select_template = self._load_file(PROMPT_SOPHIE_LESSON_SELECT) if include_sophie else None
        self.sophie_react_teaching_template = self._load_file(PROMPT_SOPHIE_REACT_TEACHING) if include_sophie else None
        self.sophie_post_grade_template = self._load_file(PROMPT_SOPHIE_POST_GRADE) if include_sophie else None
        self.tutor_wrapup_template = self._load_file(PROMPT_TUTOR_WRAPUP)
        self.sophie_wrapup_template = self._load_file(PROMPT_SOPHIE_WRAPUP) if include_sophie else None

        # Load Basil model
        self._load_basil_model()

        if self.verbose:
            print(f"[Session {self.session_id}] Initialized")
            print(f"  - Sophie: {'enabled' if include_sophie else 'disabled'}")
            print(f"  - Training phase: {training_phase}")
            print(f"  - Graded turns: min={MIN_GRADED_TURNS_PER_SESSION}, max={MAX_GRADED_TURNS_PER_SESSION}")
            print(f"  - Basil age_band: {self.assessment['age_band']} ({self.assessment['age_band_description']})")
            print(f"  - Basil max tokens: {self.basil_max_tokens}")

    def _load_file(self, filepath: str) -> str:
        """Load a text file."""
        if os.path.exists(filepath):
            with open(filepath, "r") as f:
                return f.read()
        return ""

    def _load_basil_model(self):
        """Load the Basil model, using per-process cache to avoid reloading every session.

        Supports model path override via:
          1. self.model_path (constructor parameter)
          2. BASIL_MODEL_PATH env var (from config)
          3. get_basil_model_name() (default: latest model)

        LoRA strength resolution (first non-None wins):
          1. self._lora_strength_override (constructor parameter)
          2. BASIL_LORA_STRENGTH env var (from config, for experiments)
          3. lora_strength_for_age_band(age_band) — smooth 0.0→1.0 scaling

        Always calls set_lora_strength() to ensure the adapter contribution is
        correctly scaled — even for age_band=0 (sets to 0.0, effectively trunk-only).
        """
        from model_cache import get_cached_model, set_lora_strength
        from config import BASIL_MODEL_PATH, BASIL_LORA_STRENGTH, lora_strength_for_age_band

        # Resolve model path: constructor param > env var > default
        if hasattr(self, '_model_path_override') and self._model_path_override:
            model_path = self._model_path_override
        elif BASIL_MODEL_PATH:
            model_path = BASIL_MODEL_PATH
        else:
            model_path = get_basil_model_name()

        if self.verbose:
            print(f"  - Basil model: {model_path}")
        self.model, self.tokenizer, self.device = get_cached_model(
            model_path, verbose=self.verbose, age_band=self.age_band_start)

        # Resolve LoRA strength: constructor override > env var > age-band scaling
        lora_strength = getattr(self, '_lora_strength_override', None)
        if lora_strength is None and BASIL_LORA_STRENGTH is not None:
            lora_strength = BASIL_LORA_STRENGTH
        if lora_strength is None:
            lora_strength = lora_strength_for_age_band(self.age_band_start)
        # Always apply — even 0.0 is meaningful (trunk-only at early age bands)
        set_lora_strength(self.model, lora_strength, verbose=self.verbose)

    def _get_recent_conversation(self, n_lines: int = None, clean_for_training: bool = True) -> str:
        """
        Get recent conversation as formatted text.

        Shows the full session transcript up to this point (not just current episode)
        to prevent intra-session repetition across episodes.

        Args:
            n_lines: Number of transcript entries to include (defaults to CONTEXT_WINDOW_LINES)
            clean_for_training: If True, strips machine markers (TASK:, SUBJECT_OF_THE_DAY:, etc.)
                               to keep training data natural
        """
        # Use last N entries from the full session transcript
        n = n_lines or CONTEXT_WINDOW_LINES
        recent = self.transcript[-n:] if self.transcript else []

        lines = []
        for speaker, text in recent:
            if clean_for_training:
                text = self._clean_transcript_for_training(text)
            lines.append(f"{speaker}: {text}")

        return "\n".join(lines)

    def _clean_transcript_for_training(self, text: str) -> str:
        """Delegate to shared utility so inference and LoRA training use same cleaning."""
        from utils.transcript import clean_transcript_for_training
        return clean_transcript_for_training(text)

    def _get_session_text_before_basil(self) -> str:
        """
        Get all Tutor/Sophie text from the transcript up to the current point.

        Used for session-word override: checks if Basil repeated any word
        that Tutor or Sophie said earlier in the session (kickoff, lesson
        selection, primer, episode content). Excludes prior Basil turns.
        """
        parts = []
        for speaker, text in self.transcript:
            if speaker != "Basil":
                parts.append(text)
        return " ".join(parts)

    def _get_rolling_transcript_window(self, max_entries: int = 20, max_chars: int = 3000, clean_for_training: bool = True) -> str:
        """
        Get a rolling transcript window for Tutor prompts.

        Returns formatted dialogue context, bounded by both entry count and character limit.
        Truncates oldest entries first if over limit.

        Args:
            max_entries: Maximum number of transcript entries to include
            max_chars: Maximum total characters (truncates oldest first)
            clean_for_training: If True, strips machine markers

        Returns:
            Formatted string of recent dialogue: "Role: content\n..."
        """
        if not self.transcript:
            return "[No conversation yet]"

        # Get last N entries
        recent = self.transcript[-max_entries:]

        # Format entries
        formatted_lines = []
        total_chars = 0

        # Work backwards from most recent to oldest
        for speaker, text in reversed(recent):
            # Clean markers if requested
            if clean_for_training:
                text = self._clean_transcript_for_training(text)

            # Skip empty entries after cleaning
            if not text:
                continue

            # Truncate individual entries if too long
            if len(text) > 300:
                text = text[:300] + "..."

            line = f"{speaker}: {text}"

            # Check if adding this line would exceed limit
            if total_chars + len(line) + 1 > max_chars:
                # Stop adding more entries
                break

            formatted_lines.insert(0, line)  # Insert at beginning to maintain order
            total_chars += len(line) + 1  # +1 for newline

        if not formatted_lines:
            # At minimum include the most recent entry
            speaker, text = self.transcript[-1]
            if len(text) > 300:
                text = text[:300] + "..."
            formatted_lines = [f"{speaker}: {text}"]

        return "\n".join(formatted_lines)

    def _get_episode_transcript(self, clean_for_training: bool = True) -> str:
        """
        Get only the current episode's transcript entries.

        Used for Basil (Phase C) and Grader (Phase D) which only need to see
        the current episode, not the full session history.
        """
        episode_entries = self.transcript[self.episode_transcript_start:]
        if not episode_entries:
            return ""

        lines = []
        for speaker, text in episode_entries:
            if clean_for_training:
                text = self._clean_transcript_for_training(text)
            if text:
                lines.append(f"{speaker}: {text}")

        return "\n".join(lines)

    def _get_full_session_transcript(self, max_chars: int = 15000, clean_for_training: bool = True) -> str:
        """
        Get the full session transcript up to this point.

        Used for Tutor, Sophie, and TaskAgent prompts that benefit from seeing
        the entire session context to avoid repetition and maintain continuity.

        Args:
            max_chars: Safety cap on total characters. Truncates oldest entries first.
            clean_for_training: If True, strips machine markers.
        """
        if not self.transcript:
            return "[No conversation yet]"

        # Format all entries
        formatted_lines = []
        total_chars = 0

        # Work backwards from most recent, stop when we hit the char limit
        for speaker, text in reversed(self.transcript):
            if clean_for_training:
                text = self._clean_transcript_for_training(text)
            if not text:
                continue
            # Truncate individual entries if very long
            if len(text) > 400:
                text = text[:400] + "..."

            line = f"{speaker}: {text}"
            if total_chars + len(line) + 1 > max_chars:
                break

            formatted_lines.insert(0, line)
            total_chars += len(line) + 1

        if not formatted_lines:
            speaker, text = self.transcript[-1]
            if len(text) > 400:
                text = text[:400] + "..."
            formatted_lines = [f"{speaker}: {text}"]

        return "\n".join(formatted_lines)

    def _debug_session_state(self, agent_name: str, context_preview: str = ""):
        """Debug instrumentation: print session state continuity info."""
        if not DEBUG_SESSION:
            return

        print(f"\n[DEBUG] {agent_name} Input:")
        print(f"  Session ID: {id(self)}")
        print(f"  Transcript ID: {id(self.transcript)}")
        print(f"  Transcript length: {len(self.transcript)} entries")
        print(f"  Turn number: {self.turn_number}")

        # Show last 3 messages
        if self.transcript:
            print(f"  Last 3 transcript entries:")
            for i, (speaker, text) in enumerate(self.transcript[-3:], 1):
                preview = text[:120] + "..." if len(text) > 120 else text
                print(f"    {i}. {speaker}: {preview}")

        if context_preview:
            print(f"  Context preview: {context_preview[:200]}...")
        print()

    def _debug_prompt_payload(self, episode_id: str, phase: str, agent_name: str,
                               messages_or_prompt, recent_conv: str = None):
        """
        Debug instrumentation: log the EXACT prompt payload being sent to LLM.

        Args:
            episode_id: Current episode ID
            phase: Current phase (A/B/C/D/E/F)
            agent_name: Agent being called
            messages_or_prompt: Either the messages list (for chat models) or prompt string
            recent_conv: The recent_conversation string if available
        """
        if not DEBUG_SESSION:
            return

        print(f"\n{'='*60}")
        print(f"[PROMPT DEBUG] Episode: {episode_id} | Phase: {phase} | Agent: {agent_name}")
        print(f"{'='*60}")

        # Show transcript state
        print(f"Transcript entries: {len(self.transcript)}")

        # Show last 5 messages from transcript
        if self.transcript:
            print(f"\nLast 5 transcript entries (role + first 120 chars):")
            for i, (speaker, text) in enumerate(self.transcript[-5:], 1):
                preview = text[:120].replace('\n', ' ')
                if len(text) > 120:
                    preview += "..."
                print(f"  {i}. [{speaker}]: {preview}")

        # Show recent_conversation if provided
        if recent_conv is not None:
            print(f"\nrecent_conversation length: {len(recent_conv)} chars")
            if recent_conv:
                print(f"recent_conversation last 400 chars:")
                print(f"  ...{recent_conv[-400:]}")
            else:
                print("  [EMPTY - THIS IS A BUG!]")

        # Show the actual prompt payload
        if isinstance(messages_or_prompt, list):
            # Chat format (list of message dicts)
            print(f"\nMessages list length: {len(messages_or_prompt)}")
            for i, msg in enumerate(messages_or_prompt):
                role = msg.get('role', '?')
                content = msg.get('content', '')
                print(f"\n  Message {i+1} [{role}] - {len(content)} chars:")
                # Show last 800 chars of each message
                if len(content) > 800:
                    print(f"    ...{content[-800:]}")
                else:
                    print(f"    {content}")
        else:
            # String prompt
            prompt_str = str(messages_or_prompt)
            print(f"\nPrompt string length: {len(prompt_str)} chars")
            print(f"Last 800 chars of rendered prompt:")
            if len(prompt_str) > 800:
                print(f"  ...{prompt_str[-800:]}")
            else:
                print(f"  {prompt_str}")

        print(f"{'='*60}\n")

    def _format_prompt_vars(self) -> dict:
        """Get common prompt template variables."""
        # Format assessment for prompt injection
        assessment_text = self._format_assessment_for_prompt()
        age_band = self.assessment.get("age_band", 0)
        basil_age_description = get_age_band_natural_description(age_band)

        # Get used lessons for the current training run
        used_lessons = load_used_lessons()

        # Build the optional teaching angle section (age_band >= 4 only)
        if self.teaching_angle:
            teaching_angle_section = (
                f"\n**Teaching Angle (bonus complexity):** \"{self.teaching_angle}\"\n"
                f"The teaching angle is an extra creative lens to push toward a less-obvious lesson. "
                f"Use it if it naturally fits the topic — but if the combination is forced, "
                f"just let it inspire a less-obvious direction. The goal is a FRESH lesson, not a forced one.\n"
            )
        else:
            teaching_angle_section = ""

        return {
            "session_transcript": self._get_full_session_transcript() or "(Session just started)",
            "subject_of_the_day": self.subject_of_the_day or "(Not yet selected)",
            "lesson_of_the_day": self.lesson_of_the_day or "(Not yet selected)",
            "topic": self.topic or "(general)",
            "teaching_angle": self.teaching_angle or "(general exploration)",
            "teaching_angle_section": teaching_angle_section,
            "recent_subjects": format_blacklist_for_prompt(self.blacklists["recent_subjects"]),
            "used_lessons": format_blacklist_for_prompt(used_lessons, max_items=999),
            # Assessment variables
            "basil_assessment": assessment_text,
            "basil_age_description": basil_age_description,
        }

    def _format_assessment_for_prompt(self) -> str:
        """Format Basil assessment as readable text for prompt injection."""
        # Only pass age_band_description - it's human-readable and self-explanatory
        age_band_desc = self.assessment.get('age_band_description', 'unknown')
        return f"Age Band: {age_band_desc}"

    def _generate_tutor_kickoff(self) -> str:
        """Generate Tutor's session kickoff: greet, announce subject, ask Sophie for lesson."""
        vars = self._format_prompt_vars()

        try:
            prompt = self.tutor_kickoff_template.format(**vars)
        except KeyError as e:
            print(f"[Session] Warning: Missing prompt variable {e}")
            prompt = f"Recent conversation:\n{vars['session_transcript']}\n\nContinue as Tutor."

        system_msg = f"You are Tutor. This is the FIRST turn. You MUST include 'SUBJECT_OF_THE_DAY: {self.subject_of_the_day}' and ask Sophie to choose a lesson."

        try:
            response = client.chat.completions.create(
                model=TUTOR_MODEL,
                messages=[
                    {"role": "system", "content": system_msg},
                    {"role": "user", "content": prompt}
                ],
                temperature=0.7,
                max_tokens=200,
            )
            result = response.choices[0].message.content.strip()
            # Fix "Tutor: Tutor:" double prefix
            if result.startswith("Tutor:"):
                result = result[6:].strip()
            return result
        except Exception as e:
            print(f"[Session] Tutor kickoff error: {e}")
            return f"SUBJECT_OF_THE_DAY: {self.subject_of_the_day}\n\nHello everyone! Today we're going to learn about {self.subject_of_the_day}. Sophie, what specific lesson should we explore today?"

    def _generate_sophie_lesson_select(self, rejected_lessons: list = None, temperature: float = 0.7) -> str:
        """Generate Sophie's lesson selection response during session prelude.

        Args:
            rejected_lessons: List of previously rejected lesson names within the
                              current topic, used as feedback so the LLM avoids similar picks.
            temperature: LLM temperature for this call (ramps on retries).
        """
        if not self.include_sophie:
            return ""

        vars = self._format_prompt_vars()
        recent_conv = vars.get('session_transcript', '')
        self._debug_session_state("Sophie-LessonSelect", recent_conv)

        # Use the dedicated lesson selection prompt
        try:
            if self.sophie_lesson_select_template:
                prompt = self.sophie_lesson_select_template.format(**vars)
            else:
                # Fallback if template missing
                prompt = f"Subject: {self.subject_of_the_day}\n\nPick a lesson and respond as Sophie."
        except KeyError as e:
            print(f"[Session] Warning: Missing prompt variable {e}")
            prompt = f"Subject: {self.subject_of_the_day}\n\nPick a lesson and respond as Sophie."

        if rejected_lessons:
            prompt += (
                f"\n\nIMPORTANT: These lessons from {self.topic} were already used or "
                f"too similar: {', '.join(rejected_lessons)}. "
                f"Pick a DIFFERENT lesson from {self.topic} that is clearly distinct."
            )

        system_msg = f"You are Sophie picking today's lesson. Subject is {self.subject_of_the_day}. You MUST include 'LESSON_OF_THE_DAY: <lesson title>' in your response. Keep under 80 words."

        try:
            response = client.chat.completions.create(
                model=SOPHIE_MODEL,
                messages=[
                    {"role": "system", "content": system_msg},
                    {"role": "user", "content": prompt}
                ],
                temperature=temperature,
                max_tokens=100,
            )
            return response.choices[0].message.content.strip()
        except Exception as e:
            print(f"[Session] Sophie lesson selection error: {e}")
            return f"LESSON_OF_THE_DAY: Introduction to {self.subject_of_the_day}\n\nLet's start with the basics today!"

    def _generate_basil_turn(self, prompt_suffix: str = "Basil:") -> str:
        """Generate Basil's response using model.generate() directly."""
        # Build context from current episode only (Basil doesn't need full session history)
        context = self._get_episode_transcript()
        full_prompt = f"{context}\n{prompt_suffix}" if context else prompt_suffix

        # Truncate to fit within GPT-2's 1024 token limit
        # Reserve tokens for generation (basil_max_tokens) plus safety margin
        max_context_tokens = 1024 - self.basil_max_tokens - 20  # ~970 tokens for context

        # Tokenize with attention mask
        inputs = self.tokenizer(
            full_prompt,
            return_tensors="pt",
            truncation=True,
            max_length=max_context_tokens,
            padding=False,
        )

        input_ids = inputs["input_ids"].to(self.device)
        attention_mask = inputs["attention_mask"].to(self.device)

        if input_ids.shape[1] >= max_context_tokens:
            print(f"[Session] Context truncated to {input_ids.shape[1]} tokens")

        try:
            # Sampling for all age bands — temperature scales from 1.4 (band 0,
            # encourages discovery via randomness) down to 1.0 (band 7, more
            # precise).  No repetition_penalty: repetition is penalized through
            # the training signal (grading), not at inference time.
            with torch.no_grad():
                output_ids = self.model.generate(
                    input_ids=input_ids,
                    attention_mask=attention_mask,
                    max_new_tokens=self.basil_max_tokens,
                    do_sample=True,
                    temperature=self.basil_temperature,
                    top_k=50,
                    pad_token_id=self.tokenizer.eos_token_id,
                    eos_token_id=self.tokenizer.eos_token_id,
                )

            # Decode full output
            full_output = self.tokenizer.decode(output_ids[0], skip_special_tokens=True)

            # Extract just Basil's response (after the prompt)
            if "Basil:" in full_output:
                response = full_output.split("Basil:")[-1].strip()
            else:
                # Get only the newly generated tokens
                prompt_len = len(self.tokenizer.decode(input_ids[0], skip_special_tokens=True))
                response = full_output[prompt_len:].strip()

            # Truncate at newline (stop at first turn boundary)
            if "\n" in response:
                response = response.split("\n")[0].strip()

            return response
        except Exception as e:
            print(f"[Session] Basil generation error: {e}")
            return ""

    def _should_stop_early(self) -> tuple:
        """
        Check if session should stop based on hybrid stop policy.

        Returns:
            (should_stop: bool, reason: str)
        """
        # Hard max: always stop at MAX_GRADED_TURNS_PER_SESSION
        if self.graded_turn_count >= MAX_GRADED_TURNS_PER_SESSION:
            return True, "max_graded_turns"

        # Don't stop before minimum unless hard failure
        if self.graded_turn_count < MIN_GRADED_TURNS_PER_SESSION:
            return False, ""

        # Early stop: check rolling window for low signal
        if len(self.recent_scores) >= EARLY_STOP_WINDOW_TURNS:
            window_scores = self.recent_scores[-EARLY_STOP_WINDOW_TURNS:]
            avg_score = sum(window_scores) / len(window_scores)

            # Compute compliance over the window
            compliant_count = sum(1 for s in window_scores if s >= COMPLIANCE_SCORE_THRESHOLD)
            compliance_rate = compliant_count / len(window_scores)

            # Early stop if BOTH score and compliance are below thresholds
            if avg_score < EARLY_STOP_MIN_AVG_SCORE and compliance_rate < EARLY_STOP_MAX_COMPLIANCE:
                if self.verbose:
                    print(f"[Session] Early stop: avg_score={avg_score:.2f} < {EARLY_STOP_MIN_AVG_SCORE}, "
                          f"compliance={compliance_rate:.1%} < {EARLY_STOP_MAX_COMPLIANCE:.0%}")
                return True, "early_stop_low_signal"

        # Legacy: also check old SCORE_FLOOR logic
        if len(self.recent_scores) >= SCORE_FLOOR_WINDOW:
            recent_avg = sum(self.recent_scores[-SCORE_FLOOR_WINDOW:]) / SCORE_FLOOR_WINDOW
            if recent_avg < SCORE_FLOOR:
                if self.verbose:
                    print(f"[Session] Stopping: rolling avg {recent_avg:.2f} < {SCORE_FLOOR}")
                return True, "score_floor"

        return False, ""

    def _generate_wrapup_tutor(self) -> str:
        """Generate Tutor's wrap-up recap line (no graded task)."""
        avg_score = 0
        if self.graded_turns:
            avg_score = sum(t["grade"]["score"] for t in self.graded_turns) / len(self.graded_turns)

        # Fallback messages calibrated to performance
        if avg_score >= 5.0:
            fallback = f"Nice work today on {self.lesson_of_the_day}, Basil! You're getting the hang of it."
        elif avg_score >= 3.0:
            fallback = f"Good try today, Basil! Learning {self.lesson_of_the_day} takes practice."
        else:
            fallback = f"Thanks for trying today, Basil. We'll practice {self.lesson_of_the_day} more next time."

        early_stop_note = ""
        if self.early_stopped:
            early_stop_note = "Note: The session ended early. Close gently with 'We'll stop here for today and try again soon.'"

        session_transcript = self._get_full_session_transcript()

        # Use prompt template
        wrapup_prompt = None
        if self.tutor_wrapup_template:
            try:
                wrapup_prompt = self.tutor_wrapup_template.format(
                    subject=self.subject_of_the_day or "(unknown)",
                    lesson=self.lesson_of_the_day or "(unknown)",
                    graded_turns=self.graded_turn_count,
                    avg_score=f"{avg_score:.1f}",
                    early_stop_note=early_stop_note,
                    session_transcript=session_transcript,
                    basil_age_description=get_age_band_natural_description(self.age_band_start),
                    length_guidance=get_length_guidance("tutor_wrapup", self.age_band_start),
                )
            except KeyError as e:
                print(f"[Session] Warning: Missing wrapup prompt variable {e}")
                wrapup_prompt = None

        # Fallback inline prompt
        if not wrapup_prompt:
            length_hint = get_length_guidance("tutor_wrapup", self.age_band_start)
            wrapup_prompt = f"""This session on "{self.lesson_of_the_day}" is ending.
Subject: {self.subject_of_the_day}
Graded turns: {self.graded_turn_count}
Average score: {avg_score:.1f}/7
{early_stop_note}

{length_hint} Your recap should:
1. Thank Basil for being here
2. Be HONEST about how it went (no false praise if score is low)
3. End warmly

DO NOT ask any questions or give any tasks. Just an honest, warm closing."""

        system_msg = "You are Tutor giving a session recap. Be honest but kind. Match your praise to actual performance. NO questions. End with a statement."

        try:
            response = client.chat.completions.create(
                model=TUTOR_MODEL,
                messages=[
                    {"role": "system", "content": system_msg},
                    {"role": "user", "content": wrapup_prompt}
                ],
                temperature=0.7,
                max_tokens=get_max_tokens_ceiling("tutor_wrapup"),
            )
            result = response.choices[0].message.content.strip()
            result = self._strip_trailing_questions(result)
            return result
        except Exception as e:
            print(f"[Session] Tutor wrapup error: {e}")
            return fallback

    def _generate_wrapup_sophie(self) -> str:
        """Generate Sophie's wrap-up closing line (no graded task)."""
        if not self.include_sophie:
            return ""

        # Simple fallbacks based on session state
        if self.early_stopped:
            fallback = "Bye Basil! We'll play again soon!"
        else:
            fallback = "Bye Basil! See you next time!"

        early_stop_note = ""
        if self.early_stopped:
            early_stop_note = "Note: The session ended early. Keep your goodbye simple and brief."

        session_transcript = self._get_full_session_transcript()

        # Use prompt template
        wrapup_prompt = None
        if self.sophie_wrapup_template:
            try:
                wrapup_prompt = self.sophie_wrapup_template.format(
                    lesson=self.lesson_of_the_day or "(unknown)",
                    early_stop_note=early_stop_note,
                    session_transcript=session_transcript,
                    length_guidance=get_length_guidance("sophie_wrapup", self.age_band_start),
                )
            except KeyError as e:
                print(f"[Session] Warning: Missing Sophie wrapup prompt variable {e}")
                wrapup_prompt = None

        # Fallback inline prompt
        if not wrapup_prompt:
            length_hint = get_length_guidance("sophie_wrapup", self.age_band_start)
            wrapup_prompt = f"""The session on "{self.lesson_of_the_day}" is ending.
{early_stop_note}
{length_hint} Say goodbye warmly from Sophie:
1. Says goodbye warmly
2. Maybe mentions next time
DO NOT ask questions. Just a warm, simple goodbye."""

        length_hint_sys = get_length_guidance("sophie_wrapup", self.age_band_start)
        system_msg = f"You are Sophie saying goodbye. {length_hint_sys} NO questions. End with a statement."

        try:
            response = client.chat.completions.create(
                model=SOPHIE_MODEL,
                messages=[
                    {"role": "system", "content": system_msg},
                    {"role": "user", "content": wrapup_prompt}
                ],
                temperature=0.7,
                max_tokens=get_max_tokens_ceiling("sophie_wrapup"),
            )
            result = response.choices[0].message.content.strip()
            result = self._strip_trailing_questions(result)
            return result
        except Exception as e:
            print(f"[Session] Sophie wrapup error: {e}")
            return fallback

    def run_wrapup(self):
        """
        Run graceful session wrap-up.

        Generates closing lines from Tutor and Sophie (not graded).
        """
        if not ENABLE_GRACEFUL_WRAPUP:
            return

        if self.verbose:
            print("\n--- Session Wrap-up ---")

        # Tutor recap
        tutor_line = self._generate_wrapup_tutor()
        self.transcript.append(("Tutor", f"[WRAPUP] {tutor_line}"))