Skip to content

Commit 1ebd6bb

Browse files
committed
feat: GOAP integration, MC robustness gate, learned posterior variance, spawn-aware planning
1 parent 3c021e1 commit 1ebd6bb

7 files changed

Lines changed: 232 additions & 53 deletions

File tree

docs/compass-hero.png

-1.17 MB
Loading

src/brain/completion.py

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,11 @@
2020

2121
log = logging.getLogger(__name__)
2222

23+
# Routines returning RUNNING longer than this are force-exited.
24+
# Must exceed the combat casting pipeline (2-3 s legit) to avoid
25+
# killing active fights mid-cast.
26+
HARD_KILL_THRESHOLD_MS = 5000
27+
2328

2429
def tick_active_routine(brain: Brain, state: GameState, now: float) -> None:
2530
"""Tick the active routine and handle SUCCESS/FAILURE/hard-kill outcomes."""
@@ -36,7 +41,7 @@ def tick_active_routine(brain: Brain, state: GameState, now: float) -> None:
3641
brain._ticked_routine_name = brain._active_name
3742

3843
if status == RoutineStatus.RUNNING:
39-
if brain.routine_tick_ms > 5000:
44+
if brain.routine_tick_ms > HARD_KILL_THRESHOLD_MS:
4045
hard_kill_routine(brain, state, now)
4146
return
4247

@@ -128,9 +133,7 @@ def notify_cycle_tracker(brain: Brain, state: GameState, status: RoutineStatus)
128133

129134

130135
def hard_kill_routine(brain: Brain, state: GameState, now: float) -> None:
131-
"""Force-exit a routine that returned RUNNING but took >5 s."""
132-
# Threshold must exceed combat casting pipeline (2-3s legit)
133-
# to avoid killing active fights mid-cast.
136+
"""Force-exit a routine that exceeded HARD_KILL_THRESHOLD_MS."""
134137
log.error("[DECISION] HARD KILL: %s took %.0fms, forcing exit", brain._active_name, brain.routine_tick_ms)
135138
assert brain._active is not None
136139
brain._active.failure_reason = "hard_kill"

src/brain/goap/actions.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -199,6 +199,23 @@ def apply_effects(self, ws: PlanWorldState) -> PlanWorldState:
199199
return ws.with_changes(targets_available=1)
200200

201201
def estimate_cost(self, ctx: AgentContext | None) -> float:
    """Estimate the cost of wandering, spawn-prediction-aware when possible.

    When a spawn predictor is attached to the context, the shortest
    predicted wait among the top predicted cells replaces the default
    heuristic, clamped between a 5 s travel-time floor and the default
    ceiling. This steers the planner toward wander-then-fight plans when
    a respawn is imminent, turning random wandering into positioning.
    """
    default_cost = _DEFAULT_COSTS["wander"]
    predictor = ctx.spawn_predictor if ctx else None
    if not predictor:
        return default_cost
    import time as _time

    predictions = predictor.best_cells(3, _time.time())
    if not predictions:
        return default_cost
    # Shortest predicted wait among candidate cells, bounded below by the
    # 5 s travel floor and above by the default wander heuristic.
    shortest_wait = min(wait for _, wait in predictions)
    return max(5.0, min(shortest_wait, default_cost))
203220

204221

src/brain/goap/planner.py

Lines changed: 56 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,8 @@
4141
SATISFACTION_THRESHOLD = 0.70 # goal is "achieved enough" at this level
4242
PLAN_BUDGET_MS = 50.0 # max time for plan generation
4343
MC_ROLLOUTS = 20 # Monte Carlo rollouts per candidate plan
44-
MC_NOISE_SIGMA = 0.15 # noise on action effects during rollouts
44+
MC_NOISE_SIGMA = 0.15 # fallback noise when no learned variance available
45+
MC_ROBUSTNESS_THRESHOLD = 0.50 # reject plans below this MC satisfaction
4546

4647

4748
@dataclass(slots=True)
@@ -402,24 +403,28 @@ def _mc_evaluate(
402403
"""Evaluate a candidate plan via Monte Carlo rollouts.
403404
404405
Runs MC_ROLLOUTS stochastic simulations of the plan. In each rollout,
405-
action effects are perturbed with Gaussian noise on continuous fields
406-
(hp_pct, mana_pct) to simulate outcome uncertainty. Returns the mean
407-
goal satisfaction across rollouts.
406+
action effects are perturbed with noise drawn from learned posterior
407+
variance (encounter history) when available, or fixed sigma as
408+
fallback. Returns the mean goal satisfaction across rollouts.
408409
409410
A plan that achieves high satisfaction across noisy rollouts is robust
410411
to the inherent uncertainty in combat outcomes, rest durations, etc.
411412
"""
412413
if not plan_actions:
413414
return goal.satisfaction(start)
414415

416+
# Derive noise sigma from learned posterior variance when available.
417+
# Wider posteriors (less data) produce more noise, naturally penalising
418+
# plans that depend on uncertain outcomes.
419+
hp_sigma, mana_sigma = self._learned_mc_sigma(ctx)
420+
415421
total_sat = 0.0
416422
for _ in range(MC_ROLLOUTS):
417423
ws = start
418424
for action in plan_actions:
419425
ws = action.apply_effects(ws)
420-
# Stochastic perturbation on continuous resource fields
421-
hp_noise = random.gauss(0, MC_NOISE_SIGMA)
422-
mana_noise = random.gauss(0, MC_NOISE_SIGMA)
426+
hp_noise = random.gauss(0, hp_sigma)
427+
mana_noise = random.gauss(0, mana_sigma)
423428
ws = ws.with_changes(
424429
hp_pct=max(0.0, min(1.0, ws.hp_pct + hp_noise)),
425430
mana_pct=max(0.0, min(1.0, ws.mana_pct + mana_noise)),
@@ -428,6 +433,37 @@ def _mc_evaluate(
428433

429434
return total_sat / MC_ROLLOUTS
430435

436+
@staticmethod
def _learned_mc_sigma(ctx: AgentContext | None) -> tuple[float, float]:
    """Derive Monte Carlo noise sigmas from encounter posterior variance.

    With enough fight history, the posterior variance on HP loss and mana
    cost reflects actual outcome uncertainty: wider posteriors (fewer
    observations) yield a larger sigma, so plans depending on poorly-known
    actions are penalised more heavily.

    Falls back to MC_NOISE_SIGMA when no learned data is available.
    """
    if not ctx or not ctx.fight_history:
        return MC_NOISE_SIGMA, MC_NOISE_SIGMA
    per_entity = ctx.fight_history.get_all_stats()
    if not per_entity:
        return MC_NOISE_SIGMA, MC_NOISE_SIGMA

    # Collect strictly-positive posterior variances across entity types.
    danger_vars = [s.danger_post_var for s in per_entity.values() if s.danger_post_var > 0]
    mana_vars = [s.mana_post_var for s in per_entity.values() if s.mana_post_var > 0]

    def to_sigma(variances: list[float]) -> float:
        # Mean posterior variance -> standard deviation, clamped so the
        # rollout noise stays within a sane band.
        sigma = (sum(variances) / len(variances)) ** 0.5 if variances else MC_NOISE_SIGMA
        return max(0.02, min(0.40, sigma))

    return to_sigma(danger_vars), to_sigma(mana_vars)
466+
431467
# -- Internal: A* Search ----------------------------------------------------
432468

433469
def _search(self, start: PlanWorldState, goal: Goal, ctx: AgentContext | None) -> Plan | None:
@@ -462,9 +498,20 @@ def _search(self, start: PlanWorldState, goal: Goal, ctx: AgentContext | None) -
462498
# Goal test: deterministic satisfaction check
463499
sat = goal.satisfaction(node.state)
464500
if sat >= SATISFACTION_THRESHOLD:
465-
# Monte Carlo robustness check: verify the plan holds
466-
# under stochastic action outcomes.
501+
# Monte Carlo robustness gate: reject plans that don't hold
502+
# under stochastic action outcomes. Uses learned posterior
503+
# variance when available, fixed sigma as fallback.
467504
mc_sat = self._mc_evaluate(node.actions, start, goal, ctx)
505+
if mc_sat < MC_ROBUSTNESS_THRESHOLD:
506+
log.log(
507+
VERBOSE,
508+
"[GOAP] Plan rejected (mc_sat=%.2f < %.2f): %d steps, cost=%.1f",
509+
mc_sat,
510+
MC_ROBUSTNESS_THRESHOLD,
511+
len(node.actions),
512+
node.g_cost,
513+
)
514+
continue # keep searching for a more robust plan
468515
log.log(
469516
VERBOSE,
470517
"[GOAP] Plan found: %d steps, %d nodes, cost=%.1f, sat=%.2f, mc_sat=%.2f",

src/brain/scoring_phases.py

Lines changed: 31 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,29 @@
2121

2222
log = logging.getLogger(__name__)
2323

24+
# Score multiplier applied when GOAP planner suggests a specific action
25+
GOAP_BOOST = 1.5
26+
27+
28+
def _resolve_phase_context(brain: Brain) -> tuple[str, str]:
    """Return (session_phase, goap_hint) derived from brain diagnostics.

    Defaults to ("grinding", "") when no diagnostics context is attached.
    """
    ctx = brain._ctx
    diag = getattr(ctx, "diag", None) if ctx else None
    if not diag:
        return "grinding", ""
    detector = getattr(diag, "phase_detector", None)
    phase = detector.current_phase if detector is not None else "grinding"
    return phase, getattr(diag, "goap_suggestion", "")
38+
39+
40+
def _apply_modifiers(score: float, phase: str, rule_name: str, goap_hint: str) -> float:
    """Scale a raw rule score by the session-phase modifier, then apply the
    GOAP boost when this rule is the planner's suggested action.
    """
    adjusted = score * get_phase_modifier(phase, rule_name)
    is_goap_pick = bool(goap_hint) and rule_name == goap_hint
    if is_goap_pick and adjusted > 0:
        adjusted *= GOAP_BOOST
    return adjusted
46+
2447

2548
def compute_divergence(brain: Brain, state: GameState, now: float, binary_winner: str) -> None:
2649
"""Phase 1: compute scores for all rules, log when score-based
@@ -74,25 +97,13 @@ def select_by_tier(
7497
continue
7598
tier_groups[r.tier].append(r)
7699

77-
# Get session phase for contextual score modifiers
78-
phase = "grinding"
79-
goap_hint = ""
80-
if brain._ctx and hasattr(brain._ctx, "diag") and brain._ctx.diag:
81-
pd = getattr(brain._ctx.diag, "phase_detector", None)
82-
if pd is not None:
83-
phase = pd.current_phase
84-
goap_hint = getattr(brain._ctx.diag, "goap_suggestion", "")
100+
phase, goap_hint = _resolve_phase_context(brain)
85101

86102
for tier in sorted(tier_groups):
87103
scored: list[tuple[float, RuleDef]] = []
88104
for r in tier_groups[tier]:
89105
t0 = time.perf_counter()
90-
s = r.score_fn(state)
91-
# Apply session phase modifier (startup, incident, idle, etc.)
92-
s *= get_phase_modifier(phase, r.name)
93-
# GOAP planner boost: prefer the planned action
94-
if goap_hint and r.name == goap_hint and s > 0:
95-
s *= 1.5 # 50% score boost for GOAP-suggested action
106+
s = _apply_modifiers(r.score_fn(state), phase, r.name, goap_hint)
96107
rule_times[r.name] = (time.perf_counter() - t0) * 1000
97108
rule_eval[r.name] = f"{s:.2f}" if s > 0 else "0"
98109
diag_results.append(f"{r.name}={s:.2f}")
@@ -114,14 +125,7 @@ def select_weighted(
114125
emergency: list[tuple[float, RuleDef]] = []
115126
normal: list[tuple[float, RuleDef]] = []
116127

117-
# Session phase for contextual modifiers
118-
phase = "grinding"
119-
goap_hint = ""
120-
if brain._ctx and hasattr(brain._ctx, "diag") and brain._ctx.diag:
121-
pd = getattr(brain._ctx.diag, "phase_detector", None)
122-
if pd is not None:
123-
phase = pd.current_phase
124-
goap_hint = getattr(brain._ctx.diag, "goap_suggestion", "")
128+
phase, goap_hint = _resolve_phase_context(brain)
125129

126130
for r in brain._rules:
127131
if r.name in brain._cooldowns and now < brain._cooldowns[r.name]:
@@ -131,12 +135,7 @@ def select_weighted(
131135
rule_times[r.name] = 0.0
132136
continue
133137
t0 = time.perf_counter()
134-
s = r.score_fn(state)
135-
# Apply session phase modifier
136-
s *= get_phase_modifier(phase, r.name)
137-
# GOAP planner boost
138-
if goap_hint and r.name == goap_hint and s > 0:
139-
s *= 1.5
138+
s = _apply_modifiers(r.score_fn(state), phase, r.name, goap_hint)
140139
rule_times[r.name] = (time.perf_counter() - t0) * 1000
141140
weighted = r.weight * s
142141
rule_eval[r.name] = f"{weighted:.1f}" if s > 0 else "0"
@@ -171,13 +170,7 @@ def select_with_considerations(
171170
emergency: list[tuple[float, RuleDef]] = []
172171
normal: list[tuple[float, RuleDef]] = []
173172

174-
phase = "grinding"
175-
goap_hint = ""
176-
if brain._ctx and hasattr(brain._ctx, "diag") and brain._ctx.diag:
177-
pd = getattr(brain._ctx.diag, "phase_detector", None)
178-
if pd is not None:
179-
phase = pd.current_phase
180-
goap_hint = getattr(brain._ctx.diag, "goap_suggestion", "")
173+
phase, goap_hint = _resolve_phase_context(brain)
181174

182175
for r in brain._rules:
183176
if r.name in brain._cooldowns and now < brain._cooldowns[r.name]:
@@ -189,12 +182,10 @@ def select_with_considerations(
189182
t0 = time.perf_counter()
190183
# Phase 4: prefer considerations over score_fn when defined
191184
if r.considerations and brain._ctx:
192-
s = score_from_considerations(r.considerations, state, brain._ctx)
185+
raw = score_from_considerations(r.considerations, state, brain._ctx)
193186
else:
194-
s = r.score_fn(state)
195-
s *= get_phase_modifier(phase, r.name)
196-
if goap_hint and r.name == goap_hint and s > 0:
197-
s *= 1.5
187+
raw = r.score_fn(state)
188+
s = _apply_modifiers(raw, phase, r.name, goap_hint)
198189
rule_times[r.name] = (time.perf_counter() - t0) * 1000
199190
weighted = r.weight * s
200191
rule_eval[r.name] = f"{weighted:.1f}" if s > 0 else "0"

tests/test_goap_actions.py

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -426,6 +426,54 @@ def test_defeat_effects_use_class_defaults(self) -> None:
426426
# ---------------------------------------------------------------------------
427427

428428

429+
class TestWanderActionSpawnPrediction:
    """WanderAction.estimate_cost consults spawn predictions when available."""

    def _wander(self) -> WanderAction:
        return WanderAction(name="wander", routine_name="WANDER")

    def _predicting_ctx(self, cells):
        """Build a context whose spawn predictor always returns *cells*."""
        from types import SimpleNamespace

        predictor = SimpleNamespace(best_cells=lambda n, now: cells)
        return SimpleNamespace(spawn_predictor=predictor)

    def test_no_ctx_returns_default(self) -> None:
        assert self._wander().estimate_cost(None) == 30.0

    def test_no_spawn_predictor_returns_default(self) -> None:
        from types import SimpleNamespace

        ctx = SimpleNamespace(spawn_predictor=None)
        assert self._wander().estimate_cost(ctx) == 30.0

    def test_imminent_respawn_reduces_cost(self) -> None:
        from core.types import Point

        ctx = self._predicting_ctx([(Point(100, 100, 0), 8.0)])
        # An 8 s predicted wait lies between the 5 s floor and the default,
        # so it becomes the cost directly.
        assert self._wander().estimate_cost(ctx) == 8.0

    def test_very_short_wait_clamps_to_minimum(self) -> None:
        from core.types import Point

        ctx = self._predicting_ctx([(Point(50, 50, 0), 2.0)])
        # 2 s predicted wait is clamped up to the 5 s travel-time floor.
        assert self._wander().estimate_cost(ctx) == 5.0

    def test_empty_predictions_returns_default(self) -> None:
        assert self._wander().estimate_cost(self._predicting_ctx([])) == 30.0
475+
476+
429477
class TestAcquireActionCost:
430478
def test_estimate_cost(self) -> None:
431479
assert _acquire().estimate_cost(None) == 5.0

0 commit comments

Comments
 (0)