bootstrap-basil/memory_manager.py at main · hunterooc/bootstrap-basil · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
# memory_manager.py
# Manages memory files: session summaries, story-so-far files, and metrics.

import os
import re
import json
from datetime import datetime
from openai import OpenAI

from config import (
    SUMMARIZER_MODEL,
    METRICS_FILE,
    SESSION_SUMMARIES_DIR,
    TASK_CATEGORIES,
    STAGE_DESCRIPTIONS,
    BASIL_ASSESSMENT_FILE,
    AGE_BAND_DESCRIPTIONS,
    BASIL_MAX_TOKENS_BY_AGE_BAND,
    TASK_CATEGORIES_BY_AGE_BAND,
    ASSESSMENT_PROMOTE_SCORE,
    ASSESSMENT_DEMOTE_SCORE,
    ASSESSMENT_WINDOW_SESSIONS,
    ASSESSMENT_MIN_COMPLIANCE_FOR_PROMOTION,
    ASSESSMENT_MAX_COMPLIANCE_FOR_DEMOTION,
    ASSESSMENT_PROGRESS_SIGNAL_REQUIRED_FOR_PROMOTION,
    COMPLIANCE_SCORE_THRESHOLD,
    DEBUG_SESSION,
    get_train_every_graded_turns,
    get_train_every_usable_turns,
)
from task_contract import normalize_targets
from file_lock_utils import get_lock

# Path to the prompt template used for session summaries. It is resolved
# relative to this module's directory: <module dir>/prompts/session_summary_prompt.txt
SESSION_SUMMARY_PROMPT_FILE = os.path.join(
    os.path.dirname(__file__), "prompts", "session_summary_prompt.txt"
)

# Module-level cache for the loaded prompt template. Stays None until
# _load_session_summary_prompt() fills it on first use.
_session_summary_prompt_cache = None

def _load_session_summary_prompt() -> str:
    """Return the session-summary prompt template, caching it after the first load.

    The text is kept in the module-level ``_session_summary_prompt_cache``.
    When the prompt file does not exist, a warning is printed and a short
    built-in instruction is cached and returned instead.
    """
    global _session_summary_prompt_cache

    # Fast path: an earlier call already filled the cache.
    if _session_summary_prompt_cache is not None:
        return _session_summary_prompt_cache

    try:
        with open(SESSION_SUMMARY_PROMPT_FILE, "r") as prompt_file:
            template = prompt_file.read()
    except FileNotFoundError:
        print(f"[Memory Manager] Warning: {SESSION_SUMMARY_PROMPT_FILE} not found, using fallback")
        template = "Summarize this training session in 1-3 factual sentences."

    _session_summary_prompt_cache = template
    return template

# Shared LLM client, created once when this module is imported.
# generate_session_summary() calls client.chat.completions.create(...) on it.
from llm_client import create_smart_client
client = create_smart_client()


def ensure_dirs():
    """Create the session-summaries directory (and parents) if it is missing."""
    os.makedirs(name=SESSION_SUMMARIES_DIR, exist_ok=True)


def load_metrics() -> dict:
    """Read the metrics document from ``METRICS_FILE``.

    Returns:
        The parsed JSON contents of the file, or the structure built by
        ``_default_metrics()`` when the file does not exist.
    """
    if not os.path.exists(METRICS_FILE):
        return _default_metrics()

    with open(METRICS_FILE, "r") as metrics_file:
        return json.load(metrics_file)


def save_metrics(metrics: dict):
    """Write ``metrics`` to ``METRICS_FILE``, merged with what is already on disk.

    The existing file is read first so that keys this module does not manage
    survive the write. Merge order:

    1. start from the on-disk contents;
    2. overlay ``metrics`` (the caller's values win);
    3. restore the on-disk values of the rolling-metrics keys listed in
       ``preserved_keys``, so a stale copy held by the caller cannot
       overwrite them.

    The merged document is serialised *before* the file is opened for
    writing, so a value that cannot be encoded raises without truncating
    the existing metrics file.

    Args:
        metrics: Metrics dict to persist.

    Raises:
        TypeError, ValueError: If the merged metrics cannot be encoded as JSON.
        OSError: If the metrics file cannot be written.
    """
    ensure_dirs()

    # Rolling-metrics fields written by metrics_manager.py; the on-disk value
    # always wins for these.
    preserved_keys = (
        "graded_turns_since_last_train", "last_train_result", "ewma_score",
        "ewma_compliance", "last_n_session_ids", "recent_summary",
        "training_run_boundaries", "total_rollbacks",
    )

    existing = {}
    if os.path.exists(METRICS_FILE):
        try:
            with open(METRICS_FILE, "r") as f:
                loaded = json.load(f)
        except (OSError, ValueError) as exc:
            # Best effort: an unreadable/corrupt file must not block the save,
            # but say so instead of silently dropping the preserved fields.
            print(f"[Memory Manager] Warning: could not read existing {METRICS_FILE} ({exc}); "
                  f"existing fields will not be preserved")
        else:
            # Anything other than a JSON object cannot be merged key-by-key.
            if isinstance(loaded, dict):
                existing = loaded

    merged = existing.copy()
    merged.update(metrics)  # Caller's fields overwrite the on-disk ones...
    for key in preserved_keys:
        if key in existing:
            merged[key] = existing[key]  # ...except the rolling-metrics fields.

    # Encode first, then write: a failure here leaves the old file intact.
    payload = json.dumps(merged, indent=2)
    with open(METRICS_FILE, "w") as f:
        f.write(payload)


def _default_metrics() -> dict:
    """Build the metrics structure used when no metrics file exists yet.

    Counters start at zero, histories start empty (one empty list per entry
    in ``TASK_CATEGORIES``), and the stage progression is seeded with a
    single stage-0 ("babble") record stamped with ``datetime.now()``.
    """
    # Each category gets its own list object.
    per_category_scores = dict((category, []) for category in TASK_CATEGORIES)

    initial_stage = {
        "stage": 0,
        "description": "babble",
        "entered_at": datetime.now().isoformat(),
    }

    fresh_metrics = {
        "total_sessions": 0,
        "total_graded_turns": 0,
        "total_training_runs": 0,
        "last_training_timestamp": None,
        "score_history": [],
        "score_by_category": per_category_scores,
        "recent_average_score": None,
        "stage_progression": [initial_stage],
    }
    return fresh_metrics


def update_metrics_after_turn(score: float, category: str):
    """Record one graded turn in the metrics file. Process-safe via file lock.

    Increments ``total_graded_turns``, appends the score to ``score_history``
    and, when ``category`` is already a key of ``score_by_category``, to that
    category's list as well. ``recent_average_score`` is then recomputed over
    the last 50 entries of ``score_history``.

    Missing counters/lists are treated as zero/empty, so a metrics file that
    was first created by another writer (without these keys) does not raise
    ``KeyError``.

    Args:
        score: Grade awarded for the turn.
        category: Task category of the turn; categories that are not present
            in ``score_by_category`` are not tracked per-category.
    """
    with get_lock(METRICS_FILE):
        metrics = load_metrics()

        # One timestamp for both records so the overall and per-category
        # entries of the same turn carry the same value.
        recorded_at = datetime.now().isoformat()

        metrics["total_graded_turns"] = metrics.get("total_graded_turns", 0) + 1

        history = metrics.setdefault("score_history", [])
        history.append({"score": score, "timestamp": recorded_at})

        by_category = metrics.get("score_by_category", {})
        if category in by_category:
            by_category[category].append({"score": score, "timestamp": recorded_at})

        # Recent average over the last 50 scores; never empty here because
        # an entry was appended just above.
        recent_scores = [entry["score"] for entry in history[-50:]]
        metrics["recent_average_score"] = sum(recent_scores) / len(recent_scores)

        save_metrics(metrics)


def update_metrics_after_session(session_id: str):
    """Increment the session counter in the metrics file. Process-safe via file lock.

    A missing ``total_sessions`` key is treated as zero, so a metrics file
    that lacks it does not raise ``KeyError``.

    Args:
        session_id: Identifier of the session that just ended. Not used by
            the current implementation; kept so callers need not change.
    """
    with get_lock(METRICS_FILE):
        metrics = load_metrics()
        metrics["total_sessions"] = metrics.get("total_sessions", 0) + 1
        save_metrics(metrics)


def update_metrics_after_training():
    """Record a completed training run in the metrics file. Process-safe via file lock.

    Increments ``total_training_runs`` (a missing key counts as zero rather
    than raising ``KeyError``), stamps ``last_training_timestamp`` with the
    current time, and stores the current assessment ``age_band`` as
    ``age_band_at_last_train``.
    """
    with get_lock(METRICS_FILE):
        metrics = load_metrics()
        metrics["total_training_runs"] = metrics.get("total_training_runs", 0) + 1
        metrics["last_training_timestamp"] = datetime.now().isoformat()
        # Record which age_band training just completed at.
        # Promotion is capped at age_band_at_last_train + 1, so Basil must
        # train at each level before advancing further.
        assessment = load_basil_assessment()
        metrics["age_band_at_last_train"] = assessment.get("age_band", 0)
        save_metrics(metrics)


def save_session_summary(session_id: str, summary: dict):
    """Write ``summary`` as JSON to ``session_<session_id>.json``.

    The file is placed in ``SESSION_SUMMARIES_DIR``, which is created first
    if needed. An existing file for the same session is overwritten.
    """
    ensure_dirs()
    target_path = os.path.join(SESSION_SUMMARIES_DIR, f"session_{session_id}.json")
    with open(target_path, "w") as out_file:
        json.dump(summary, out_file, indent=2)


def load_recent_session_summaries(n: int = 10) -> list:
    """Load up to ``n`` session summaries, newest file name first.

    "Recent" is decided by sorting the ``session_*.json`` file names in
    ``SESSION_SUMMARIES_DIR`` in descending order and taking the first ``n``.

    Returns:
        List of parsed summary documents, in that descending file-name order.
    """
    ensure_dirs()

    if not os.path.exists(SESSION_SUMMARIES_DIR):
        return []

    summary_names = [
        entry
        for entry in os.listdir(SESSION_SUMMARIES_DIR)
        if entry.startswith("session_") and entry.endswith(".json")
    ]
    summary_names.sort(reverse=True)

    loaded = []
    for name in summary_names[:n]:
        with open(os.path.join(SESSION_SUMMARIES_DIR, name), "r") as summary_file:
            loaded.append(json.load(summary_file))
    return loaded


def compute_compliance_rate(graded_turns: list) -> float:
    """
    Return the fraction of graded turns that count as compliant.

    A turn is compliant when its score is at least COMPLIANCE_SCORE_THRESHOLD.
    The score is read from the nested ``grade["score"]`` when present,
    otherwise from the turn's own ``score`` field, otherwise taken as 0.

    Args:
        graded_turns: List of graded turn dicts

    Returns:
        Value between 0.0 and 1.0; 0.0 when there are no graded turns
    """
    if not graded_turns:
        return 0.0

    def _score_of(turn):
        # Nested grade dict first, then the flat score field.
        return turn.get("grade", {}).get("score", turn.get("score", 0))

    compliant_total = sum(
        1 for turn in graded_turns if _score_of(turn) >= COMPLIANCE_SCORE_THRESHOLD
    )
    return compliant_total / len(graded_turns)


def compute_progress_signal(compliance_rate: float, avg_score: float) -> bool:
    """
    Decide whether a session shows stable, usable progress.

    The signal is True when either condition holds:
    - compliance_rate is at least 0.60, or
    - avg_score is at least 3.0 and compliance_rate is at least 0.40.

    Args:
        compliance_rate: Session compliance rate (0.0 to 1.0)
        avg_score: Session average score

    Returns:
        True when one of the two conditions above is met, else False
    """
    return bool(
        compliance_rate >= 0.60
        or (avg_score >= 3.0 and compliance_rate >= 0.40)
    )


def _extract_target_words_from_task(task_text: str) -> list:
    """
    DEPRECATED: Legacy target word extraction from task text.

    New code should use task_spec["targets"] directly.
    This function is kept only for processing legacy data that lacks explicit targets.
    """
    if not task_text:
        return []

    targets = []

    # Pattern: Say 'word' or Say "word"
    quoted = re.findall(r"[Ss]ay\s+['\"]([^'\"]+)['\"]", task_text)
    targets.extend(quoted)

    # Pattern: Say word (single word after "Say")
    say_match = re.search(r"[Ss]ay\s+(\w+)", task_text)
    if say_match and say_match.group(1).lower() not in ["yes", "no", "a", "or", "one", "the"]:
        targets.append(say_match.group(1))

    # Pattern: yes or no
    if "yes or no" in task_text.lower():
        targets.extend(["yes", "no"])

    # Pattern: A or B? (e.g., "Apple or banana?")
    ab_match = re.search(r"(\w+)\s+or\s+(\w+)\?", task_text, re.IGNORECASE)
    if ab_match:
        targets.extend([ab_match.group(1), ab_match.group(2)])

    return [t.lower().strip() for t in targets if t]


def _get_targets_from_turn(turn: dict) -> list:
    """
    Return the target words for one graded turn.

    Explicit targets in turn["task_spec"]["targets"] win. When there are
    none, the task text (from the task_spec, else from the turn itself) is
    run through the legacy regex extractor. Either way the result is passed
    through normalize_targets.

    Args:
        turn: Graded turn dict with task_spec, task_text, etc.

    Returns:
        Whatever normalize_targets returns for the chosen target list
    """
    spec = turn.get("task_spec", {})

    explicit_targets = spec.get("targets", [])
    if not explicit_targets:
        # Old data without explicit targets: recover them from the task text.
        text = spec.get("task_text", "") or turn.get("task_text", "")
        return normalize_targets(_extract_target_words_from_task(text))

    return normalize_targets(explicit_targets)


def _find_notable_hits(graded_turns: list) -> list:
    """
    Find target words that Basil actually produced (case-insensitive).

    Returns list of unique words that appeared in both task targets and Basil output.
    Uses _get_targets_from_turn for consistent target extraction.
    """
    hits = set()

    for turn in graded_turns:
        # Get Basil's output
        basil_output = turn.get("basil", "") or turn.get("basil_output", "")
        if not basil_output:
            continue

        basil_lower = basil_output.lower()

        # Get targets using centralized function (prefers task_spec["targets"])
        targets = _get_targets_from_turn(turn)

        # Check which targets appear in Basil's output
        for target in targets:
            if target in basil_lower:
                hits.add(target)

    return sorted(list(hits))


def _compute_strategy_trend(graded_turns: list) -> str:
    """Compute the dominant strategy trend from graded turns."""
    strategies = [t.get("next_strategy", "maintain") for t in graded_turns if t.get("next_strategy")]

    if not strategies:
        return "maintain"

    # Count occurrences
    from collections import Counter
    counts = Counter(strategies)

    # Return most common
    return counts.most_common(1)[0][0]


def _collect_all_target_words(graded_turns: list) -> list:
    """
    Collect all target words that were asked across all turns.

    Uses _get_targets_from_turn for consistent target extraction.
    """
    all_targets = set()

    for turn in graded_turns:
        targets = _get_targets_from_turn(turn)
        all_targets.update(targets)

    return sorted(list(all_targets))


def generate_session_summary(
    session_id: str,
    transcript: list,
    graded_turns: list,
    subject: str = None,
    lesson: str = None,
) -> dict:
    """
    Generate, save and return a compact summary of a session.

    Session statistics and structured fields are computed locally; only the
    free-text ``summary_text`` comes from the summarizer model. Rendering the
    prompt template and calling the model are both best-effort: if either
    fails (bad placeholder in the template, API error, empty response), the
    error is printed and a locally built factual sentence is used instead, so
    the summary is still saved.

    Uses factual teacher-notes style, not corporate prose.

    Args:
        session_id: Unique session identifier
        transcript: Session transcript; only its length is used here
        graded_turns: List of graded turn data
        subject: Session subject (optional)
        lesson: Session lesson (optional)

    Returns:
        Session summary dict with structured fields (also written to disk
        via save_session_summary)
    """
    # Calculate session stats
    scores = [t.get("grade", {}).get("score", t.get("score", 0)) for t in graded_turns]
    avg_score = sum(scores) / len(scores) if scores else 0
    # Sorted so the prompt text and the stored "categories_used" list are
    # stable from run to run (set iteration order is not). key=str keeps the
    # sort from failing if a category is not a string.
    categories_used = sorted(
        {t.get("task_category", "unknown") for t in graded_turns}, key=str
    )

    # Compute compliance rate and progress signal
    compliance_rate = compute_compliance_rate(graded_turns)
    progress_signal = compute_progress_signal(compliance_rate, avg_score)

    # Extract structured data
    notable_hits = _find_notable_hits(graded_turns)
    target_words = _collect_all_target_words(graded_turns)
    strategy_trend = _compute_strategy_trend(graded_turns)

    # Determine next session plan based on performance
    next_session_plan = []
    if avg_score < 1.0:
        next_session_plan.append("Simplify to yes/no or single word tasks")
    elif avg_score < 2.5:
        next_session_plan.append("Focus on repetition of successful words")
    else:
        next_session_plan.append("Try slightly harder tasks (2-word phrases)")

    if not notable_hits:
        next_session_plan.append("Use more familiar vocabulary (mom, dad, yes, no)")
    elif len(notable_hits) >= 2:
        next_session_plan.append(f"Build on successful words: {', '.join(notable_hits[:3])}")

    try:
        # Template rendering lives inside the try: str.format raises on a
        # stray brace or unknown placeholder, and that must not cost us the
        # whole session summary.
        prompt = _load_session_summary_prompt().format(
            subject=subject or "(unknown)",
            lesson=lesson or "(unknown)",
            task_categories=", ".join(categories_used),
            graded_count=len(graded_turns),
            avg_score=f"{avg_score:.1f}",
            strategy_trend=strategy_trend,
            target_words=", ".join(target_words[:10]) if target_words else "(none extracted)",
            notable_hits=", ".join(notable_hits) if notable_hits else "(none)",
        )

        # Debug: print prompt if DEBUG_SESSION is set
        if DEBUG_SESSION:
            print("\n[Memory Manager DEBUG] Session Summary Prompt")
            print(f"  Prompt file: {SESSION_SUMMARY_PROMPT_FILE}")
            print(f"  Rendered prompt ({len(prompt)} chars):")
            print("  ---")
            print(prompt[-800:])  # Last 800 chars
            print("  ---\n")

        response = client.chat.completions.create(
            model=SUMMARIZER_MODEL,
            messages=[
                {"role": "system", "content": "You write brief factual teacher notes. No corporate prose. No anthropomorphism."},
                {"role": "user", "content": prompt}
            ],
            temperature=0.3,
            max_tokens=100,  # Keep it short
        )

        summary_text = response.choices[0].message.content.strip()
    except Exception as e:
        print(f"[Memory Manager] Summary generation error: {e}")
        # Fallback: generate a simple factual summary
        hits_str = f"Hits: {', '.join(notable_hits)}. " if notable_hits else ""
        summary_text = f"Lesson: {lesson or 'unknown'}. {len(graded_turns)} tasks, avg {avg_score:.1f}/7. {hits_str}Strategy: {strategy_trend}."

    summary = {
        "session_id": session_id,
        "timestamp": datetime.now().isoformat(),
        "subject": subject,
        "lesson": lesson,
        "total_turns": len(transcript),
        "graded_turns": len(graded_turns),
        "average_score": avg_score,
        "compliance_rate": compliance_rate,
        "progress_signal": progress_signal,
        "categories_used": categories_used,
        "summary_text": summary_text,
        "scores": scores,
        # Structured fields derived from the graded turns
        "notable_hits": notable_hits,
        "target_words_asked": target_words,
        "strategy_trend": strategy_trend,
        "next_session_plan": next_session_plan,
    }

    save_session_summary(session_id, summary)
    return summary


# =============================================================================
# BASIL ASSESSMENT MANAGEMENT
# =============================================================================

def load_basil_assessment() -> dict:
    """Read the Basil assessment from ``BASIL_ASSESSMENT_FILE``.

    Returns:
        The parsed JSON contents of the file, or the structure built by
        ``_default_basil_assessment()`` when the file does not exist.
    """
    if not os.path.exists(BASIL_ASSESSMENT_FILE):
        return _default_basil_assessment()

    with open(BASIL_ASSESSMENT_FILE, "r") as assessment_file:
        return json.load(assessment_file)


def save_basil_assessment(assessment: dict):
    """Write ``assessment`` as indented JSON to ``BASIL_ASSESSMENT_FILE``.

    ``ensure_dirs()`` is called first; any existing file is overwritten.
    """
    ensure_dirs()
    with open(BASIL_ASSESSMENT_FILE, "w") as assessment_file:
        json.dump(assessment, assessment_file, indent=2)


def _default_basil_assessment() -> dict:
    """Return default Basil assessment structure."""
    return {
        "age_band": 0,
        "capabilities": [
            "No reliable words yet; mostly babble.",
            "Occasionally produces recognizable fragments."
        ],
        "preferred_task_categories": ["control", "vocab"],
        "output_caps": {
            "basil_max_tokens": 30,
            "tutor_max_tokens": 120,
            "sophie_max_tokens": 80,
            "task_agent_max_tokens": 160,
            "grader_max_tokens": 120
        },
        "progress_signal": False,
        "compliance_rate": 0.0,
        "score_history": [],
        "assessed_age_band_history": [],  # New: session-level developmental assessments
        "recent_session_metrics": {
            "avg_score_last_session": None,
            "avg_score_window": None,
            "compliance_rate_last_session": None,
            "compliance_rate_window": None,
            "progress_signal_window": False,
            "avg_assessed_age_band": None,  # New: rolling average for promotion
            "window_sessions": 0
        },
        "last_updated": None
    }


def update_basil_assessment(session_avg_score: float, graded_turns: list, assessed_age_band: int = None, session_id: str = None) -> dict:
    """
    Fold a finished session into Basil's assessment and return the result.

    The whole load-modify-save cycle runs while holding the file lock for
    BASIL_ASSESSMENT_FILE; the actual work is done by
    _update_basil_assessment_locked.

    With assessed_age_band given, the promotion/demotion decision follows the
    rolling average of assessed bands. Without it, the older score-based
    gated logic is used (backward compat).

    Args:
        session_avg_score: Average score for the just-completed session
        graded_turns: List of graded turn data from the session
        assessed_age_band: Optional assessed developmental age_band (0-7) from assessment agent
        session_id: Optional session ID stored alongside the assessed band

    Returns:
        Updated assessment dict
    """
    with get_lock(BASIL_ASSESSMENT_FILE):
        updated = _update_basil_assessment_locked(
            session_avg_score=session_avg_score,
            graded_turns=graded_turns,
            assessed_age_band=assessed_age_band,
            session_id=session_id,
        )
        return updated


def _update_basil_assessment_locked(session_avg_score: float, graded_turns: list, assessed_age_band: int = None, session_id: str = None) -> dict:
    """Inner implementation of update_basil_assessment, called under lock.

    Steps, in order:
      1. Load the assessment and append this session's record to
         ``score_history`` (trimmed to the last 20 entries).
      2. If ``assessed_age_band`` is given, append it to
         ``assessed_age_band_history`` (also trimmed to 20).
      3. Aggregate the last ``ASSESSMENT_WINDOW_SESSIONS`` score records.
      4. Read the metrics file for ``usable_turns_since_last_train`` and
         ``age_band_at_last_train`` to derive the two promotion gates
         (training threshold reached; cap of last-trained band + 1).
      5. Decide promote / demote / hold:
         - assessed path (``assessed_age_band`` given): compares the rounded
           average of the last window of assessed bands with the current band;
         - score-based path (no ``assessed_age_band``): moves at most one band,
           behind compliance / progress / evidence gates.
      6. Refresh ``recent_session_metrics``, ``age_band``, ``capabilities``,
         ``preferred_task_categories``, ``output_caps["basil_max_tokens"]``
         and ``last_updated``, then save and return the assessment.

    Side effects besides saving the assessment: prints ``[Assessment]`` lines
    and appends two JSON lines to ``.cursor/debug.log`` next to this module
    (best effort, failures ignored).

    Args:
        session_avg_score: Average score for the just-completed session
        graded_turns: List of graded turn data from the session
        assessed_age_band: Optional assessed age_band for this session
        session_id: Optional session ID stored with the assessed band

    Returns:
        Updated assessment dict (the same object that was saved)
    """
    assessment = load_basil_assessment()
    current_age_band = assessment.get("age_band", 0)

    # Compute this session's compliance rate and progress signal
    session_compliance = compute_compliance_rate(graded_turns)
    session_progress = compute_progress_signal(session_compliance, session_avg_score)

    # Add this session's metrics to history
    score_history = assessment.get("score_history", [])
    score_history.append({
        "session_avg": session_avg_score,
        "compliance_rate": session_compliance,
        "progress_signal": session_progress,
        "timestamp": datetime.now().isoformat(),
        "graded_turns": len(graded_turns)
    })
    # Keep only last 20 sessions
    score_history = score_history[-20:]
    assessment["score_history"] = score_history

    # Update current session metrics
    assessment["compliance_rate"] = session_compliance
    assessment["progress_signal"] = session_progress

    # Track assessed_age_band if provided
    if assessed_age_band is not None:
        # Initialize assessed_age_band_history if needed
        if "assessed_age_band_history" not in assessment:
            assessment["assessed_age_band_history"] = []

        # Add this session's assessment
        assessment["assessed_age_band_history"].append({
            "assessed_age_band": assessed_age_band,
            "timestamp": datetime.now().isoformat(),
            "session_id": session_id,
        })
        # Keep only last 20 sessions
        assessment["assessed_age_band_history"] = assessment["assessed_age_band_history"][-20:]

    # Aggregate the last ASSESSMENT_WINDOW_SESSIONS entries of score_history
    # (this reads the in-memory history, not the session summary files)
    window_metrics = _compute_window_metrics(score_history, ASSESSMENT_WINDOW_SESSIONS)

    # Check if we're at or past the training threshold (would trigger training this cycle)
    # BLOCK promotion when at threshold: training must run first, then promotion on next batch
    # Uses usable turns (same metric as the training trigger in orchestrator.py)
    # NOTE: the metrics file is read here without taking its own lock.
    metrics = load_metrics()
    usable_since_train = metrics.get("usable_turns_since_last_train", 0)
    train_threshold = get_train_every_usable_turns(current_age_band)
    at_or_past_train_threshold = usable_since_train >= train_threshold

    # Cap promotion at one level above the age_band where training last ran.
    # Formula: max_age_band = age_band_at_last_train + 1
    # Before any training (-1 + 1 = 0), Basil stays at band 0.
    # After training at band N, Basil can advance to at most band N+1.
    # Demotion is unrestricted: the cap is only consulted in the promotion branches below.
    age_band_at_last_train = metrics.get("age_band_at_last_train", -1)
    max_promotion_band = age_band_at_last_train + 1

    # Determine if we should adjust age_band
    new_age_band = current_age_band
    decision = "hold"

    # #region agent log
    # NOTE(review): this looks like leftover debugging instrumentation. It
    # appends one JSON line with the decision inputs to
    # <module dir>/.cursor/debug.log on every call; any failure is swallowed.
    # Confirm whether it should ship.
    try:
        import json as _json
        _log_path = os.path.join(os.path.dirname(__file__), ".cursor", "debug.log")
        os.makedirs(os.path.dirname(_log_path), exist_ok=True)
        with open(_log_path, "a") as _lf:
            _lf.write(_json.dumps({"location": "memory_manager.py:promotion_check", "message": "promotion_decision_inputs", "data": {"current_age_band": current_age_band, "assessed_age_band": assessed_age_band, "usable_since_train": usable_since_train, "train_threshold": train_threshold, "at_or_past_train_threshold": at_or_past_train_threshold, "age_band_at_last_train": age_band_at_last_train, "max_promotion_band": max_promotion_band, "window_sessions": ASSESSMENT_WINDOW_SESSIONS, "assessed_history_len": len(assessment.get("assessed_age_band_history", []))}, "timestamp": int(datetime.now().timestamp() * 1000), "hypothesisId": "promotion_unblock"}) + "\n")
    except Exception:
        pass
    # #endregion

    # Use assessed_age_band for promotion if available, otherwise fall back to score-based logic.
    # The history key is always present here when assessed_age_band is given
    # (it is created above), so this branch is effectively "assessed_age_band is not None".
    if assessed_age_band is not None and "assessed_age_band_history" in assessment:
        # Use developmental assessment for promotion
        assessed_history = assessment["assessed_age_band_history"]

        if len(assessed_history) >= ASSESSMENT_WINDOW_SESSIONS:
            # Compute rolling average of assessed_age_bands
            recent_assessments = assessed_history[-ASSESSMENT_WINDOW_SESSIONS:]
            avg_assessed_age_band = sum(a["assessed_age_band"] for a in recent_assessments) / len(recent_assessments)

            # Simple promotion/demotion based on rounded average.
            # Python's round() rounds exact halves to the nearest even
            # integer (2.5 -> 2, 3.5 -> 4).
            rounded_avg = round(avg_assessed_age_band)

            if rounded_avg > current_age_band and current_age_band < 7:
                # Block promotion when at/past threshold: training must run first
                if at_or_past_train_threshold:
                    print(f"[Assessment] Promotion blocked: training threshold reached (usable {usable_since_train} >= {train_threshold}), run training first")
                elif rounded_avg > max_promotion_band:
                    # Cap at one level above last trained band
                    if max_promotion_band > current_age_band:
                        new_age_band = max_promotion_band
                        decision = "promote"
                        print(f"[Assessment] Promoting Basil from age_band {current_age_band} to {new_age_band} "
                              f"(capped: assessed {avg_assessed_age_band:.2f} -> max {max_promotion_band}, "
                              f"last trained at band {age_band_at_last_train})")
                    else:
                        print(f"[Assessment] Promotion blocked: must train at band {current_age_band} first "
                              f"(last trained at band {age_band_at_last_train}, assessed: {avg_assessed_age_band:.2f})")
                else:
                    # Within the cap: jump straight to the rounded average
                    # (this can be more than one band above the current one).
                    new_age_band = rounded_avg
                    decision = "promote"
                    print(f"[Assessment] Promoting Basil from age_band {current_age_band} to {new_age_band} (assessed: {avg_assessed_age_band:.2f})")
            elif rounded_avg < current_age_band and current_age_band > 0:
                # Demotion also goes straight to the rounded average, with no training gate.
                new_age_band = rounded_avg
                decision = "demote"
                print(f"[Assessment] Demoting Basil from age_band {current_age_band} to {new_age_band} (assessed: {avg_assessed_age_band:.2f})")
            else:
                print(f"[Assessment] Age band unchanged: {current_age_band} (assessed: {avg_assessed_age_band:.2f})")
        else:
            print(f"[Assessment] Not enough assessed sessions ({len(assessed_history)} < {ASSESSMENT_WINDOW_SESSIONS}), holding age_band")

    elif window_metrics["count"] >= ASSESSMENT_WINDOW_SESSIONS:
        # Fallback to score-based logic (backward compat for old sessions)
        avg_score_window = window_metrics["avg_score"]
        compliance_window = window_metrics["compliance_rate"]
        progress_window = window_metrics["progress_signal"]

        # Check for promotion (requires multiple gates)
        if avg_score_window >= ASSESSMENT_PROMOTE_SCORE and current_age_band < 7:
            # Gate 1: Must NOT be at training threshold (training runs first, then promotion)
            if at_or_past_train_threshold:
                print(f"[Assessment] Promotion blocked: training threshold reached (usable {usable_since_train} >= {train_threshold}), run training first")
            elif current_age_band >= max_promotion_band:
                print(f"[Assessment] Promotion blocked: must train at band {current_age_band} first "
                      f"(last trained at band {age_band_at_last_train})")
            else:
                # Gate 2: Compliance rate must meet threshold
                compliance_ok = compliance_window >= ASSESSMENT_MIN_COMPLIANCE_FOR_PROMOTION

                # Gate 3: Progress signal (if required)
                if ASSESSMENT_PROGRESS_SIGNAL_REQUIRED_FOR_PROMOTION:
                    progress_ok = progress_window
                else:
                    progress_ok = True

                # Gate 4: Evidence of actual progress (existing check)
                evidence_ok = _check_progress_evidence(graded_turns, current_age_band)

                if compliance_ok and progress_ok and evidence_ok:
                    new_age_band = current_age_band + 1
                    decision = "promote"
                    print(f"[Assessment] Promoting Basil from age_band {current_age_band} to {new_age_band} (score-based)")
                else:
                    # Log why promotion was blocked
                    blocked_reasons = []
                    if not compliance_ok:
                        blocked_reasons.append(f"compliance {compliance_window:.1%} < {ASSESSMENT_MIN_COMPLIANCE_FOR_PROMOTION:.0%}")
                    if not progress_ok:
                        blocked_reasons.append("progress_signal=False")
                    if not evidence_ok:
                        blocked_reasons.append("no evidence of progress")
                    print(f"[Assessment] Promotion blocked: {', '.join(blocked_reasons)}")

        # Check for demotion (requires a low window score AND at least one of:
        # collapsed compliance, no progress signal)
        elif avg_score_window <= ASSESSMENT_DEMOTE_SCORE and current_age_band > 0:
            # Demotion requires either low compliance OR no progress signal
            compliance_collapsed = compliance_window <= ASSESSMENT_MAX_COMPLIANCE_FOR_DEMOTION
            no_progress = not progress_window

            if compliance_collapsed or no_progress:
                new_age_band = current_age_band - 1
                decision = "demote"
                print(f"[Assessment] Demoting Basil from age_band {current_age_band} to {new_age_band} (score-based)")
            else:
                print(f"[Assessment] Demotion blocked: compliance={compliance_window:.1%}, progress={progress_window}")

    # Compute avg_assessed_age_band for recent_session_metrics.
    # Unlike the decision above, this averages whatever history exists, even
    # when it is shorter than the window.
    avg_assessed_age_band = None
    if "assessed_age_band_history" in assessment and assessment["assessed_age_band_history"]:
        recent_assessments = assessment["assessed_age_band_history"][-ASSESSMENT_WINDOW_SESSIONS:]
        if recent_assessments:
            avg_assessed_age_band = sum(a["assessed_age_band"] for a in recent_assessments) / len(recent_assessments)

    # Update recent_session_metrics for debugging/transparency
    assessment["recent_session_metrics"] = {
        "avg_score_last_session": session_avg_score,
        "avg_score_window": window_metrics["avg_score"],
        "compliance_rate_last_session": session_compliance,
        "compliance_rate_window": window_metrics["compliance_rate"],
        "progress_signal_window": window_metrics["progress_signal"],
        "window_sessions": window_metrics["count"],
        "avg_assessed_age_band": avg_assessed_age_band,  # rolling average used by the assessed path
    }

    # Log summary
    avg_assessed_str = f"{avg_assessed_age_band:.2f}" if avg_assessed_age_band is not None else "N/A"
    if assessed_age_band is not None:
        print(f"[Assessment] Window: avg_score={window_metrics['avg_score']:.2f}, "
              f"compliance={window_metrics['compliance_rate']:.1%}, "
              f"assessed_age_band={assessed_age_band}, "
              f"avg_assessed_age_band={avg_assessed_str}, "
              f"decision={decision}")
    else:
        print(f"[Assessment] Window: avg_score={window_metrics['avg_score']:.2f}, "
              f"compliance={window_metrics['compliance_rate']:.1%}, "
              f"progress={window_metrics['progress_signal']}, decision={decision} (score-based)")

    # #region agent log
    # NOTE(review): second half of the debug instrumentation; records the
    # decision outcome. No makedirs here, so it relies on the directory
    # created by the first block (and silently does nothing otherwise).
    try:
        import json as _json
        _log_path = os.path.join(os.path.dirname(__file__), ".cursor", "debug.log")
        with open(_log_path, "a") as _lf:
            _lf.write(_json.dumps({"location": "memory_manager.py:promotion_result", "message": "promotion_decision_result", "data": {"old_age_band": current_age_band, "new_age_band": new_age_band, "decision": decision}, "timestamp": int(datetime.now().timestamp() * 1000), "hypothesisId": "promotion_unblock"}) + "\n")
    except Exception:
        pass
    # #endregion

    # Update age_band
    assessment["age_band"] = new_age_band

    # Update capabilities based on age_band
    assessment["capabilities"] = _generate_capabilities(new_age_band,
        [s["session_avg"] for s in score_history[-ASSESSMENT_WINDOW_SESSIONS:]], graded_turns)

    # Update preferred task categories
    assessment["preferred_task_categories"] = TASK_CATEGORIES_BY_AGE_BAND.get(
        new_age_band, ["control", "vocab"]
    )

    # Update output caps based on age_band.
    # Assumes the loaded assessment already has an "output_caps" dict
    # (the default structure does); a file without it would raise KeyError here.
    assessment["output_caps"]["basil_max_tokens"] = BASIL_MAX_TOKENS_BY_AGE_BAND.get(
        new_age_band, 30
    )

    # Update timestamp
    assessment["last_updated"] = datetime.now().isoformat()

    save_basil_assessment(assessment)
    return assessment


def _compute_window_metrics(score_history: list, window_size: int) -> dict:
    """
    Compute aggregated metrics over the assessment window.

    Args:
        score_history: List of session records with session_avg, compliance_rate, progress_signal
        window_size: Number of sessions to consider

    Returns:
        Dict with avg_score, compliance_rate, progress_signal (majority), count
    """
    recent = score_history[-window_size:]

    if not recent:
        return {
            "avg_score": 0.0,
            "compliance_rate": 0.0,
            "progress_signal": False,
            "count": 0
        }

    # Compute average score across window
    scores = [s.get("session_avg", 0) for s in recent]
    avg_score = sum(scores) / len(scores)

    # Compute average compliance rate across window
    compliance_rates = [s.get("compliance_rate", 0.0) for s in recent]
    avg_compliance = sum(compliance_rates) / len(compliance_rates)

    # Progress signal: True if majority of sessions had progress_signal=True
    progress_signals = [s.get("progress_signal", False) for s in recent]
    progress_true_count = sum(1 for p in progress_signals if p)
    majority_progress = progress_true_count > len(progress_signals) / 2

    return {
        "avg_score": avg_score,
        "compliance_rate": avg_compliance,
        "progress_signal": majority_progress,
        "count": len(recent)
    }


def _check_progress_evidence(graded_turns: list, current_age_band: int) -> bool:
    """
    Check if graded turns show evidence of progress warranting promotion.

    Evidence varies by age_band:
    - 0->1: Any English fragments appearing
    - 1->2: Recognizable English words
    - 2->3: Consistent target word hits
    - 3->4: Multi-word relevant responses
    - 4->5: Coherent phrases/short sentences
    - 5->6: Follows instructions, produces complete sentences
    - 6->7: Meaningful multi-sentence answers
    """
    if not graded_turns:
        return False

    # Look at grader evidence and scores
    high_score_turns = [t for t in graded_turns if t.get("grade", {}).get("score", 0) >= 3]

    # Need at least 30% of turns to be decent
    if len(high_score_turns) < len(graded_turns) * 0.3:
        return False

    # Check evidence strings for relevant patterns
    for turn in high_score_turns:
        evidence = turn.get("grade", {}).get("evidence", [])
        justification = turn.get("grade", {}).get("justification", "")

        if current_age_band == 0:
            # Looking for any English fragments
            if any(keyword in justification.lower() for keyword in
                   ["english", "fragment", "recognizable", "word"]):
                return True
        elif current_age_band == 1:
            # Looking for recognizable English words
            if any(keyword in justification.lower() for keyword in
                   ["recognizable", "word", "english", "clear"]):
                return True
        elif current_age_band == 2:
            # Looking for consistent target word hits
            if any(keyword in justification.lower() for keyword in
                   ["target", "found", "match", "consistent", "correct"]):
                return True
        elif current_age_band == 3:
            # Looking for multi-word relevant responses
            if any(keyword in justification.lower() for keyword in
                   ["relevant", "on-topic", "appropriate", "multi-word"]):
                return True
        elif current_age_band == 4:
            # Looking for coherent phrases/short sentences
            if any(keyword in justification.lower() for keyword in
                   ["phrase", "sentence", "coherent", "follows", "instruction"]):
                return True
        elif current_age_band == 5:
            # Looking for complete sentences, follows instructions
            if any(keyword in justification.lower() for keyword in
                   ["complete", "sentence", "follows", "instruction", "meaningful"]):
                return True
        elif current_age_band == 6:
            # Looking for meaningful multi-sentence answers
            if any(keyword in justification.lower() for keyword in
                   ["meaningful", "answer", "response", "explanation", "multi-sentence"]):
                return True

    return False


def _generate_capabilities(age_band: int, recent_scores: list, graded_turns: list) -> list:
    """Generate capability bullets based on age_band and recent performance."""
    capabilities = []

    avg_score = sum(recent_scores) / len(recent_scores) if recent_scores else 0

    if age_band == 0:
        capabilities = [
            "Produces random tokens with no recognizable English.",
            f"Average task score: {avg_score:.1f}/7",
            "Best suited for: control tasks, length constraints."
        ]
    elif age_band == 1:
        capabilities = [
            "Produces English fragments but no consistent compliance.",
            f"Average task score: {avg_score:.1f}/7",
            "Best suited for: control tasks, vocab attempts.",
            "Still struggling with: producing recognizable words consistently."
        ]
    elif age_band == 2:
        capabilities = [
            "Occasionally produces target words, low compliance rate.",
            f"Average task score: {avg_score:.1f}/7",
            "Can attempt: one-word tasks, simple yes/no.",
            "Still struggling with: consistent word production."
        ]
    elif age_band == 3:
        capabilities = [
            "Consistently produces single-word responses, 30%+ compliance.",
            f"Average task score: {avg_score:.1f}/7",
            "Can attempt: control tasks, vocab, simple relevance tasks.",
            "Emerging: reliable single-word production."
        ]
    elif age_band == 4:
        capabilities = [
            "Produces multi-word on-topic responses.",
            f"Average task score: {avg_score:.1f}/7",
            "Can attempt: vocab, relevance, memory tasks.",
            "Emerging: short phrases, basic word combinations."
        ]
    elif age_band == 5:
        capabilities = [
            "Produces simple complete sentences.",
            f"Average task score: {avg_score:.1f}/7",
            "Can attempt: relevance, memory, conversation tasks.",
            "Emerging: sentence structure, coherent responses."
        ]
    elif age_band == 6:
        capabilities = [
            "Engages in basic Q&A with 1-2 sentence responses.",
            f"Average task score: {avg_score:.1f}/7",
            "Can attempt: memory, conversation tasks.",
            "Emerging: meaningful dialogue, following instructions."
        ]
    elif age_band == 7:
        capabilities = [
            "Shows reasoning and can provide explanations and comparisons.",
            f"Average task score: {avg_score:.1f}/7",
            "Can attempt: memory, conversation tasks with reasoning.",
            "Ready for: more complex vocabulary, multi-step reasoning."
        ]

    return capabilities


def get_assessment_for_prompt() -> dict:
    """
    Load the stored assessment and reduce it to the fields used for prompt injection.

    Returns dict with:
        - age_band: int (0 when the assessment has none)
        - age_band_description: str ("unknown" when the band has no description)
        - capabilities: list of strings (empty when absent)
        - preferred_categories: list of strings (["control", "vocab"] when absent)
        - basil_max_tokens: int (30 when absent)
    """
    stored = load_basil_assessment()
    band = stored.get("age_band", 0)

    prompt_view = {"age_band": band}
    prompt_view["age_band_description"] = AGE_BAND_DESCRIPTIONS.get(band, "unknown")
    prompt_view["capabilities"] = stored.get("capabilities", [])
    # Stored under "preferred_task_categories", exposed under a shorter key
    prompt_view["preferred_categories"] = stored.get(
        "preferred_task_categories", ["control", "vocab"]
    )
    output_caps = stored.get("output_caps", {})
    prompt_view["basil_max_tokens"] = output_caps.get("basil_max_tokens", 30)
    return prompt_view


if __name__ == "__main__":
    def _run_smoke_test() -> None:
        """Print the stored metrics, record one test turn, then print them again."""
        print("Testing Memory Manager...")

        ensure_dirs()
        metrics = load_metrics()
        print(f"Current metrics: {json.dumps(metrics, indent=2)}")

        # Record one test turn, then reload to show what changed
        update_metrics_after_turn(2.5, "control")
        print("Updated metrics after test turn")

        metrics = load_metrics()
        print(f"Updated metrics: {json.dumps(metrics, indent=2)}")

    _run_smoke_test()