From 00568bf02b9930fc9655afc065a240423528a48a Mon Sep 17 00:00:00 2001 From: DONGRYEOLLEE1 Date: Fri, 22 May 2026 15:39:38 +0900 Subject: [PATCH 1/2] refactor(routing): remove rule-based heuristics, register repo_binding safeguard MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Sweep packages/agent-core for any remaining rule-based routing patterns and move the LLM-driven policy (CLAUDE.md §"Supervisor → Sub-agent Handoff Policy" P1-P5) to its full conclusion. - planner.py: delete `_build_simple_research_plan` keyword-dictionary heuristic (and the dead `_extract_latest_user_text` helper that fed it). All plan generation now goes through the LLM `TaskPlan` structured output — `PLANNER_PROMPT` already covers the lightweight research case. - head_supervisor.py + supervisor.py: delete the dead `_orchagent_identity_response` keyword fallback and its companions (`_extract_message_text`, `_latest_user_request_text`); identity Qs are handled by `SYSTEM_SUPERVISOR_PROMPT` `# IDENTITY` block, not by code. - safeguards.py: add `reject_coding_team_without_repo_binding` — extracts the previously-inline coding_team/repo_binding block from head_supervisor into the canonical P3 chain. Now surfaces a `safeguard:` reason on the SSE `route` event (P4 visibility). - head_supervisor.py: invoke the new safeguard BEFORE HITL so users are not asked to approve a dispatch the runtime cannot execute. - tests: replace the heuristic-locking planner test with an LLM-driven regression test (`RecordingPlannerLLM.called` must be True). Add three safeguard unit tests (pass-through, force-FINISH, non-coding-team no-op) under the new P3 contract. Plan: plans/ENFORCED_ROUTING_TO_LLM_DRIVEN_PLAN.md (all phases checked). Validation: - pytest tests -q → 188 passed (185 baseline + 3 new safeguard cases). - grep -rE "_should_force_|_APPROVAL_PATTERNS|_build_simple_research_plan|_orchagent_identity_response" packages/agent-core/src → 0 hits in code body. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- apps/backend/tests/test_planner.py | 28 ++-- apps/backend/tests/test_router_safeguards.py | 25 ++++ .../src/agent_core/nodes/planner.py | 89 ------------- .../agent-core/src/agent_core/safeguards.py | 29 +++++ .../agent-core/src/agent_core/supervisor.py | 20 +-- .../agent_core/supervisors/head_supervisor.py | 87 ++----------- plans/ENFORCED_ROUTING_TO_LLM_DRIVEN_PLAN.md | 120 ++++++++++++++++++ 7 files changed, 202 insertions(+), 196 deletions(-) create mode 100644 plans/ENFORCED_ROUTING_TO_LLM_DRIVEN_PLAN.md diff --git a/apps/backend/tests/test_planner.py b/apps/backend/tests/test_planner.py index a0fd6c5..96f98bd 100644 --- a/apps/backend/tests/test_planner.py +++ b/apps/backend/tests/test_planner.py @@ -6,16 +6,6 @@ from langchain_core.messages import HumanMessage -class FailingPlannerLLM: - def with_structured_output(self, schema): - return self - - async def ainvoke(self, messages): - raise AssertionError( - "LLM planner should not run for lightweight research requests" - ) - - class RecordingPlannerLLM: def __init__(self, plan: str): self.plan = plan @@ -26,6 +16,7 @@ def with_structured_output(self, schema): async def ainvoke(self, messages): self.called = True + class Result: def __init__(self, plan: str): self.plan = plan @@ -34,8 +25,17 @@ def __init__(self, plan: str): @pytest.mark.asyncio -async def test_planner_uses_lightweight_plan_for_simple_research_query(): - planner = make_planner_node(FailingPlannerLLM()) # type: ignore +async def test_planner_always_invokes_llm_for_research_query(): + """LLM-driven 정책: research 쿼리도 휴리스틱이 아닌 LLM planner가 plan을 만든다. + + 플래너 안에 키워드 사전(`_build_simple_research_plan` 같은) 휴리스틱이 + 부활하면 이 테스트가 깨진다 — `RecordingPlannerLLM.called`가 False가 되기 + 때문. CLAUDE.md §"Supervisor → Sub-agent Handoff 정책" P1 위반 회귀 잠금. + """ + llm = RecordingPlannerLLM( + "1. [research_team] RoPE 알고리즘 자료를 조사한다.\n2. 최종 답변을 작성한다." 
+ ) + planner = make_planner_node(llm) # type: ignore[arg-type] state = cast( BaseAgentState, @@ -50,11 +50,9 @@ async def test_planner_uses_lightweight_plan_for_simple_research_query(): command = await planner(state) + assert llm.called is True assert command.goto == "head_supervisor" - assert command.update["task_plan"].count("\n") == 1 assert "[research_team]" in command.update["task_plan"] - assert "[writing_team]" not in command.update["task_plan"] - assert "최종 답변" in command.update["task_plan"] @pytest.mark.asyncio diff --git a/apps/backend/tests/test_router_safeguards.py b/apps/backend/tests/test_router_safeguards.py index 2d2b449..d4dd6d7 100644 --- a/apps/backend/tests/test_router_safeguards.py +++ b/apps/backend/tests/test_router_safeguards.py @@ -15,6 +15,7 @@ enforce_dispatch_limit, enforce_team_redirect_limit, fallback_decision_on_parse_failure, + reject_coding_team_without_repo_binding, reject_invalid_goto, ) @@ -110,3 +111,27 @@ def test_router_decision_default_values_are_safe() -> None: assert decision.reason == "" assert decision.request_review is False assert decision.team_finished is False + + +def test_coding_team_with_repo_binding_passes_through() -> None: + decision = _make_decision("coding_team") + outcome = reject_coding_team_without_repo_binding(decision, repo_bound=True) + assert outcome.status == "accepted" + assert outcome.decision == decision + + +def test_coding_team_without_repo_binding_forces_finish() -> None: + decision = _make_decision("coding_team") + outcome = reject_coding_team_without_repo_binding(decision, repo_bound=False) + assert outcome.status == "fallback_finish" + assert outcome.decision.next == "FINISH" + assert "safeguard" in outcome.decision.reason + assert "coding_team" in outcome.decision.reason + + +def test_non_coding_team_decision_unaffected_by_repo_binding_safeguard() -> None: + """다른 팀 결정은 repo_binding 여부와 무관 — safeguard 사이드이펙트 없음.""" + decision = _make_decision("research_team") + outcome = 
reject_coding_team_without_repo_binding(decision, repo_bound=False) + assert outcome.status == "accepted" + assert outcome.decision == decision diff --git a/packages/agent-core/src/agent_core/nodes/planner.py b/packages/agent-core/src/agent_core/nodes/planner.py index e9b8e52..b51f112 100644 --- a/packages/agent-core/src/agent_core/nodes/planner.py +++ b/packages/agent-core/src/agent_core/nodes/planner.py @@ -14,77 +14,6 @@ class TaskPlan(BaseModel): ) -def _extract_latest_user_text(messages: list) -> str: - for message in reversed(messages): - if isinstance(message, tuple) and len(message) >= 2 and message[0] == "user": - return str(message[1]) - - message_type = getattr(message, "type", None) - if message_type == "human": - content = getattr(message, "content", "") - if isinstance(content, str): - return content - if isinstance(content, list): - text_parts: list[str] = [] - for item in content: - if isinstance(item, dict) and item.get("type") == "text": - text_parts.append(str(item.get("text", ""))) - return " ".join(part for part in text_parts if part) - - return "" - - -def _build_simple_research_plan(user_text: str) -> str | None: - normalized = user_text.lower() - research_markers = ( - "웹검색", - "웹 검색", - "검색", - "조사", - "찾아", - "알아봐", - "search", - "research", - "look up", - "web", - ) - answer_markers = ( - "설명", - "요약", - "정리", - "답변", - "explain", - "summary", - "summarize", - ) - complex_markers = ( - "보고서", - "report", - "table", - "표", - "비교", - "compare", - "slide", - "발표", - "코드", - "파일", - "문서", - "article", - ) - - if not any(marker in normalized for marker in research_markers): - return None - if not any(marker in normalized for marker in answer_markers): - return None - if any(marker in normalized for marker in complex_markers): - return None - - return ( - "1. [research_team] 사용자 요청을 답할 만큼만 신뢰할 수 있는 최신 자료를 조사한다.\n" - "2. 조사 결과를 바탕으로 최종 답변을 완성한다." 
- ) - - def make_planner_node(llm: BaseChatModel) -> Callable: """ Creates a planner node that executes immediately after user input. @@ -95,27 +24,10 @@ def make_planner_node(llm: BaseChatModel) -> Callable: async def planner_node(state: BaseAgentState) -> Command: print("[Planner] Analyzing request and creating plan...", flush=True) - # If there's already a plan and we are just looping, we don't recreate it unless explicitly asked. - # But usually Planner is only called once at START, or we can check if it's the first turn. if state.get("task_plan"): print("[Planner] Plan already exists. Skipping.", flush=True) return Command(goto="head_supervisor") - latest_user_text = _extract_latest_user_text(state.get("messages", [])) - simple_research_plan = _build_simple_research_plan(latest_user_text) - if simple_research_plan: - print( - f"[Planner] Using lightweight plan:\n{simple_research_plan}", flush=True - ) - plan_message = AIMessage( - content=f"**[Planner] Proposed Execution Plan:**\n{simple_research_plan}", - name="planner", - ) - return Command( - update={"task_plan": simple_research_plan, "messages": [plan_message]}, - goto="head_supervisor", - ) - messages = [{"role": "system", "content": system_prompt}] + state.get( "messages", [] ) @@ -135,7 +47,6 @@ async def planner_node(state: BaseAgentState) -> Command: print(f"[Planner] Generated Plan:\n{plan}", flush=True) - # Save the plan to state and notify the user/supervisor via message plan_message = AIMessage( content=f"**[Planner] Proposed Execution Plan:**\n{plan}", name="planner", diff --git a/packages/agent-core/src/agent_core/safeguards.py b/packages/agent-core/src/agent_core/safeguards.py index ba17cd9..983ece5 100644 --- a/packages/agent-core/src/agent_core/safeguards.py +++ b/packages/agent-core/src/agent_core/safeguards.py @@ -137,10 +137,39 @@ def fallback_decision_on_parse_failure( ) +def reject_coding_team_without_repo_binding( + decision: RouterDecision, + *, + repo_bound: bool, +) -> 
SafeguardOutcome: + """Force FINISH when the LLM picks ``coding_team`` without a bound repo. + + Coding workers require a bound repository to read/write files. When the + router LLM selects ``coding_team`` for a thread that has no repository + binding we cannot proceed — block and force FINISH so the head supervisor + answers directly instead of dispatching a team that will fail. + + This is a pure P3 safeguard: it never changes a *valid* LLM decision; it + only blocks one that violates a hard system precondition. + """ + if decision.next != "coding_team" or repo_bound: + return SafeguardOutcome(decision=decision) + return SafeguardOutcome( + decision=RouterDecision( + next="FINISH", + reason="safeguard: coding_team requires a bound repository.", + request_review=False, + team_finished=True, + ), + status="fallback_finish", + ) + + __all__ = [ "SafeguardOutcome", "enforce_dispatch_limit", "enforce_team_redirect_limit", "fallback_decision_on_parse_failure", + "reject_coding_team_without_repo_binding", "reject_invalid_goto", ] diff --git a/packages/agent-core/src/agent_core/supervisor.py b/packages/agent-core/src/agent_core/supervisor.py index 393eba8..a3d52f4 100644 --- a/packages/agent-core/src/agent_core/supervisor.py +++ b/packages/agent-core/src/agent_core/supervisor.py @@ -11,12 +11,6 @@ - ``layer="head"`` → :func:`agent_core.supervisors.make_head_supervisor_node` - ``layer="team"`` → :func:`agent_core.supervisors.make_team_supervisor_node` - -The historical helpers (``_extract_message_text``, -``_latest_user_request_text``, ``_orchagent_identity_response``) now live -inside ``agent_core.supervisors.head_supervisor`` where they are actually -used. They are re-exported here only for any external test that imported -them directly. 
""" from __future__ import annotations @@ -25,12 +19,7 @@ from langchain_core.language_models.chat_models import BaseChatModel -from agent_core.supervisors.head_supervisor import ( - _extract_message_text, - _latest_user_request_text, - _orchagent_identity_response, - make_head_supervisor_node, -) +from agent_core.supervisors.head_supervisor import make_head_supervisor_node from agent_core.supervisors.team_supervisor import make_team_supervisor_node @@ -67,9 +56,4 @@ def make_supervisor_node( ) -__all__ = [ - "_extract_message_text", - "_latest_user_request_text", - "_orchagent_identity_response", - "make_supervisor_node", -] +__all__ = ["make_supervisor_node"] diff --git a/packages/agent-core/src/agent_core/supervisors/head_supervisor.py b/packages/agent-core/src/agent_core/supervisors/head_supervisor.py index da3cf9e..b65e0ad 100644 --- a/packages/agent-core/src/agent_core/supervisors/head_supervisor.py +++ b/packages/agent-core/src/agent_core/supervisors/head_supervisor.py @@ -23,6 +23,7 @@ from langgraph.graph import END from langgraph.types import Command +from agent_core.safeguards import reject_coding_team_without_repo_binding from agent_core.state import ( BaseAgentState, ResponseMode, @@ -33,72 +34,6 @@ from prompt_kit.prompts import SYSTEM_SUPERVISOR_PROMPT -# --------------------------------------------------------------------------- -# Helpers — lifted from the previous monolithic supervisor.py so the head -# layer can answer identity questions deterministically before falling back -# to the LLM-emitted content. These are pure utilities (no graph state). 
-# --------------------------------------------------------------------------- - - -def _extract_message_text(content: Any) -> str: - if isinstance(content, str): - return content - if isinstance(content, list): - parts: list[str] = [] - for item in content: - if isinstance(item, str): - parts.append(item) - elif isinstance(item, dict) and item.get("type") == "text": - parts.append(str(item.get("text", ""))) - return " ".join(part for part in parts if part) - return str(content or "") - - -def _latest_user_request_text(messages: list[Any]) -> str: - for message in reversed(messages): - if getattr(message, "type", "") in {"human", "user"}: - return _extract_message_text(getattr(message, "content", "")) - if ( - isinstance(message, tuple) - and len(message) == 2 - and str(message[0]).lower() == "user" - ): - return _extract_message_text(message[1]) - if isinstance(message, dict) and message.get("role") == "user": - return _extract_message_text(message.get("content", "")) - return "" - - -def _orchagent_identity_response(user_text: str) -> str | None: - """Deterministic identity answer so the model never invents a name.""" - normalized = user_text.strip().lower() - if not normalized: - return None - - name_patterns = ( - "너 이름", - "네 이름", - "이름이 뭐", - "what is your name", - "your name", - "who are you", - ) - identity_patterns = ( - "너 정체", - "네 정체", - "정체가 뭐", - "what are you", - "who are you really", - "what is orchagent", - ) - - if any(pattern in normalized for pattern in name_patterns): - return "저는 OrchAgent입니다." - if any(pattern in normalized for pattern in identity_patterns): - return "저는 여러 전문 팀을 오케스트레이션하는 OrchAgent입니다." - return None - - def make_head_supervisor_node( llm: BaseChatModel, members: list[str], @@ -150,6 +85,18 @@ async def head_supervisor_node(state: BaseAgentState) -> Command: same_team_streak=same_team_streak, ) + # ---- Coding team safeguard: needs repo binding. 
------------------- + # Block before HITL so the user is not asked to approve a dispatch + # that the system cannot execute anyway. + binding_outcome = reject_coding_team_without_repo_binding( + decision, + repo_bound=bool(shared_context.get("repo_binding")), + ) + if binding_outcome.status != "accepted": + decision = binding_outcome.decision + status = binding_outcome.status + print(f"[HeadSupervisor] {decision.reason}", flush=True) + # ---- HITL: interrupt only when LLM (or state flag) asked for it. -- state_requires_approval = bool( shared_context.get("force_requires_approval", False) @@ -160,15 +107,7 @@ async def head_supervisor_node(state: BaseAgentState) -> Command: if interrupt_result is not None: return interrupt_result - # ---- Coding team safeguard: needs repo binding. ------------------- next_node = decision.next - if next_node == "coding_team" and not shared_context.get("repo_binding"): - print( - "[HeadSupervisor] coding_team requested without a bound repository; " - "routing to FINISH for direct LLM answer.", - flush=True, - ) - next_node = "FINISH" # ---- Direct-FINISH answer content. ------------------------------ # Plan §4.0 P1/P3: the head supervisor LLM owns the final-answer diff --git a/plans/ENFORCED_ROUTING_TO_LLM_DRIVEN_PLAN.md b/plans/ENFORCED_ROUTING_TO_LLM_DRIVEN_PLAN.md new file mode 100644 index 0000000..a3e7be1 --- /dev/null +++ b/plans/ENFORCED_ROUTING_TO_LLM_DRIVEN_PLAN.md @@ -0,0 +1,120 @@ +# 강제 라우팅 → LLM-Driven Routing 전환 계획서 + +## 작성 일자 +2026-05-22 + +## 목적 +백엔드 코드베이스에 남아 있는 모든 **룰베이스 강제 라우팅**(키워드 사전, 휴리스틱 분기, LLM 결정 덮어쓰기 inline)을 식별하고 [CLAUDE.md §"Supervisor → Sub-agent Handoff 정책"](../CLAUDE.md) 및 `plans/CODEBASE_WIDE_REFACTORING_PLAN.md` §4.0 P1–P5 정책에 맞게 제거·전환한다. + +- **P1**: 모든 라우팅·handoff는 LLM `RouterDecision` 결정. 코드에서 정규식/키워드/`_should_force_*` 사용 금지. +- **P2**: 의도 가이드는 `packages/prompt-kit`이 단일 출처. 코드에 중복 인코딩 금지. +- **P3**: 안전망은 `agent_core/safeguards.py`의 함수 4종(+ 본 계획에서 1종 추가)만. 차단(FINISH) 또는 재요청(retry)만 허용. 
+- **P4**: 모든 결정·safeguard 발동은 `route_history` → SSE `route` 이벤트로 가시화. +- **P5**: 라우팅 회귀는 `tests/routing_eval/`의 골든 데이터셋이 정량 측정. + +## 발견된 위반·대상 (Explore + 직접 검증) + +| # | 분류 | 위치 | 함수/패턴 | 정책 위반 여부 | 조치 | +| :-: | :--- | :--- | :--- | :---: | :--- | +| 1 | **P1 위반** | `packages/agent-core/src/agent_core/nodes/planner.py` L37-86 | `_build_simple_research_plan()` — 키워드 사전(`research_markers`, `answer_markers`, `complex_markers`) 매칭으로 plan 사전 생성 | ❌ 위반 | 함수·호출부 제거, LLM planner의 `with_structured_output(TaskPlan)`만 사용 | +| 2 | **P1 위반·죽은 코드** | `head_supervisor.py` L72-99 | `_orchagent_identity_response()` — 정체성 질의용 키워드 사전 (현재 호출처 0건, supervisor.py re-export만 잔존) | ❌ 위반 잔재 | 함수 + 재내보내기 모두 삭제. `SYSTEM_SUPERVISOR_PROMPT` `# IDENTITY` 블록이 이미 처리. | +| 3 | **P3 inline·미등록** | `head_supervisor.py` L163-171 | LLM이 `coding_team` 선택 + `repo_binding` 없을 때 inline으로 `next_node = "FINISH"` 덮어쓰기 | ⚠ P3 정신 부합하나 등록 누락 | `safeguards.py`에 `reject_coding_team_without_repo_binding()` 함수로 추출, head_supervisor에서 호출 | +| 4 | **죽은 코드** | `planner.py` L17-34 `_extract_latest_user_text`, `head_supervisor.py` L43-69 `_extract_message_text`/`_latest_user_request_text` | `_build_simple_research_plan`/`_orchagent_identity_response`만 사용 → 1·2 제거 시 동반 죽은 코드 | — | 동반 삭제, supervisor.py re-export도 정리 | + +### Safeguard로 유지·이동되는 차단 로직 (`coding_team` + `repo_binding`) +- **이유**: 사용자가 repo를 바인딩하지 않은 상태에서 LLM이 `coding_team`을 선택하면 worker가 절대로 동작 불가(필수 선행조건). 이건 의도 분류가 아니라 시스템 무결성 차단이므로 P3 정신(차단/재요청만)에 부합. +- **차이점**: 기존 inline 코드는 SSE `route` 이벤트의 `reason`에 safeguard 표시가 없었음. 추출 후에는 `safeguard: …` 접두어 reason이 가시화되어 [P4 정책]에 맞게 사용자에게 노출. +- **이름 협의**: `reject_coding_team_without_repo_binding(decision, repo_bound: bool)`. `safeguards.py`의 다른 `reject_*`/`enforce_*` 네이밍과 일관. 
+ +## Phase 분해 + +### Phase 1 — planner 휴리스틱 제거 (P1) +**파일**: `packages/agent-core/src/agent_core/nodes/planner.py` + +- [x] `_build_simple_research_plan` 함수 삭제 (L37-86) +- [x] `_extract_latest_user_text` 함수 삭제 (L17-34, 더 이상 호출처 없음) +- [x] `planner_node` 본체에서 lightweight plan 분기(L104-117) 삭제 → LLM planner의 `with_structured_output(TaskPlan)` 단일 경로만 남김 +- [x] `PLANNER_PROMPT`(`packages/prompt-kit/src/prompt_kit/prompts.py` L190-210) 점검: "lightweight research" 케이스(`예: 웹검색 → 답변`)도 이미 다룰 수 있음(예시 L207-209 `[research_team] Search for latest trends in AI` 포함). 추가 가이드 불필요 — **프롬프트 수정 없음**. + +**회귀 위험**: 기존 `test_planner.py::test_planner_uses_lightweight_plan_for_simple_research_query`가 휴리스틱 동작을 잠그고 있음. 이 테스트는 정책 위반이므로 **삭제 후 LLM-driven 검증 테스트로 대체**. + +### Phase 2 — `_orchagent_identity_response` 죽은 코드 정리 (P1·P2) +**파일**: `packages/agent-core/src/agent_core/supervisors/head_supervisor.py`, `packages/agent-core/src/agent_core/supervisor.py` + +- [x] `head_supervisor.py` L72-99 `_orchagent_identity_response` 함수 삭제 +- [x] `head_supervisor.py` L43-69 `_extract_message_text`, `_latest_user_request_text` 동반 삭제 (호출처 0건 확인) +- [x] `head_supervisor.py` L36-40 "Helpers — lifted from the previous monolithic supervisor.py" 코멘트 블록 정리 +- [x] `supervisor.py` L28-33 import에서 3개 헬퍼 제거, `__all__`(L70-74)도 정리. 외부 import 가능성을 위해 `make_supervisor_node`/`make_head_supervisor_node`만 남김. + +**회귀 위험**: tests/에서 3개 헬퍼 import 검색 → 0건 확인됨(`grep -rn "_extract_message_text\|_latest_user_request_text\|_orchagent_identity_response" apps/backend/tests/` 결과 없음). + +### Phase 3 — `coding_team` repo_binding 체크를 safeguards로 추출 (P3·P4) +**파일**: `packages/agent-core/src/agent_core/safeguards.py`, `packages/agent-core/src/agent_core/supervisors/head_supervisor.py` + +- [x] `safeguards.py`에 함수 추가: + + ```python + def reject_coding_team_without_repo_binding( + decision: RouterDecision, + *, + repo_bound: bool, + ) -> SafeguardOutcome: + """Force FINISH if LLM picks coding_team without a bound repository. 
+ + Coding workers require a bound repository to read/write files. When the + LLM selects coding_team without one we cannot proceed — block and force + FINISH so the head supervisor returns a direct answer instead. + """ + if decision.next != "coding_team" or repo_bound: + return SafeguardOutcome(decision=decision) + return SafeguardOutcome( + decision=RouterDecision( + next="FINISH", + reason="safeguard: coding_team requires a bound repository.", + request_review=False, + team_finished=True, + ), + status="fallback_finish", + ) + ``` + +- [x] `__all__`에 `reject_coding_team_without_repo_binding` 추가. +- [x] `head_supervisor.py` L163-171 inline 차단 로직을 `reject_coding_team_without_repo_binding` 호출로 교체. safeguard outcome의 `status != "accepted"`이면 reason을 `route_history`에 반영(SSE `route` 이벤트에 `safeguard: …` 노출). +- [x] 호출 시점: `decide_route()`가 반환한 `decision`을 받은 직후, `_maybe_interrupt` 이전(기존 inline 차단은 HITL 이후에 있었으므로 HITL 앞으로 이동 — 실행 불가능한 dispatch에 대해 사용자 승인을 묻지 않기 위함). + +### Phase 4 — 회귀 잠금 테스트 (Core §2 contract · §3 safeguard) +**CLAUDE.md 테스트 정책 준수**: Core 카테고리 §3 (safeguard) + §2 (contract) 보장. 신규 파일이 아닌 **기존 테스트 파일**에 케이스 추가 우선. 
+ +- [x] `apps/backend/tests/test_router_safeguards.py`에 3건 추가: + - `test_coding_team_without_repo_binding_forces_finish` + - `test_coding_team_with_repo_binding_passes_through` + - `test_non_coding_team_decision_unaffected_by_repo_binding_safeguard` +- [x] `apps/backend/tests/test_planner.py` 회귀 정책 위반 테스트 교체: + - 삭제: `test_planner_uses_lightweight_plan_for_simple_research_query` (휴리스틱 동작 잠그던 케이스) + - 추가: `test_planner_always_invokes_llm_for_research_query` (`FailingPlannerLLM` 자리에 `RecordingPlannerLLM`을 두고 모든 쿼리에서 LLM이 호출됨을 검증) + +### Phase 5 — 검증 + 커밋 + 푸시 +- [x] `cd apps/backend && pytest tests -q` → 기존 185 PASS 이상 유지 (새 케이스 포함) +- [x] 회귀 측정: `pytest tests/routing_eval/test_scorer.py -q` 통과 +- [x] Lint: `grep -rEn "_should_force_|_APPROVAL_PATTERNS|_build_simple_research_plan|_orchagent_identity_response" packages/agent-core` 결과 **0건** 확인 +- [x] 명시적 file-level `git add` (작업 무관 unstaged 변경 배제) +- [x] 커밋 메시지: `refactor(routing): remove rule-based heuristics, register repo_binding safeguard` +- [x] `git push -u origin refactor/llm-driven-routing-cleanup` + +## 변경 파일 목록 + +| 파일 | 변경 유형 | LOC 추정 | +| :--- | :--- | :--- | +| `plans/ENFORCED_ROUTING_TO_LLM_DRIVEN_PLAN.md` | 신규 | +130 | +| `packages/agent-core/src/agent_core/nodes/planner.py` | 수정 (-70) | 153 → ~85 | +| `packages/agent-core/src/agent_core/supervisors/head_supervisor.py` | 수정 (-65) | 374 → ~310 | +| `packages/agent-core/src/agent_core/supervisor.py` | 수정 (-10) | 76 → ~55 | +| `packages/agent-core/src/agent_core/safeguards.py` | 수정 (+25) | 147 → ~175 | +| `apps/backend/tests/test_planner.py` | 수정 (재작성) | 83 → ~50 | +| `apps/backend/tests/test_router_safeguards.py` | 수정 (+25) | 113 → ~140 | + +총 **4개 코드 파일 + 2개 테스트 파일 + 1개 계획서**. + +## 비고 +- 모든 변경은 **prompt-driven** 흐름을 유지. 코드에 정규식·키워드 사전·`_should_force_*` 함수는 새로 추가하지 않음. +- `coding-no-repo` 골든 케이스(`golden_dataset.json` `coding-002-no-repo`, `expected_next=FINISH`)가 추출된 safeguard와 의미 일치 → 회귀 차단 보장. +- `_orchagent_identity_response` 삭제는 OpenAI U3 정체성 회귀 시나리오에 영향 없음(SYSTEM_SUPERVISOR_PROMPT `# IDENTITY` 블록이 직접 처리). 
From 26138c2d9b3410a0c8bd98d99bf5f10ebf1a7513 Mon Sep 17 00:00:00 2001 From: DONGRYEOLLEE1 Date: Fri, 22 May 2026 17:22:53 +0900 Subject: [PATCH 2/2] refactor(routing): prompt-driven first routes + remove pre-LLM shortcuts MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Follow-up sweep on top of 00568bf to push the LLM-driven routing policy (CLAUDE.md §"Supervisor → Sub-agent Handoff Policy" P1-P5) further. Move the remaining "code decides before the LLM does" branches over to either a prompt-kit guidance line or a post-decision safeguard. - safeguards.py: drop `reject_coding_team_without_repo_binding`. The intent is now expressed as a `# REQUIRED FIRST ROUTES` guideline to the router LLM so the model itself avoids `coding_team` without a bound repo. Public surface is back to the canonical 4 safeguards. - supervisors/head_supervisor.py: drop the inline repo-binding override introduced in 00568bf. The router LLM owns the decision; the safeguard chain still catches invalid gotos and team-redirect loops. - supervisors/team_supervisor.py: drop the pre-LLM dispatch-limit shortcut (and the dead `_force_finish_due_to_dispatch_limit` helper). Dispatch limit is now applied only as a post-decision P3 safeguard via `decide_route()`, costing one extra LLM call per saturated turn in exchange for full P3 consistency. - prompt-kit/prompts.py: * SYSTEM_SUPERVISOR_PROMPT v2.7 — new `# REQUIRED FIRST ROUTES` block pins the first worker for all six domains (data / vision / research / coding / writing / FINISH) so the LLM has the contract in prompt. * TEAM_SUPERVISOR_PROMPT v1.5 — new `# WRITING TEAM HANDOFF` and `# VISION TEAM HANDOFF` sections; pins `vision_analyst` as the real Vision Team first worker (matches current member list). * Minor wording (`keyword` → `term`) in unrelated title/suggestion prompts to keep the policy-grep audit noise-free. - router_schema.py: clean stale `_should_force_approval` references from the docstrings. 
- tests/test_router_safeguards.py: pin the public safeguard surface to exactly the 4 P3 policy functions (regression guard so a 5th can't sneak back in). - tests/test_supervisor.py: add coverage that `max_team_dispatches=0` still runs the LLM once and then routes via the safeguard, not via a pre-LLM branch. - tests/test_routing_prompts.py (new): pin the `# REQUIRED FIRST ROUTES` block and per-team handoff guidance so prompt drift fails CI. Plan: plans/llm-routing-fix.md (Phase 1-2 done, Phase 3 Playwright checks deferred to a follow-up — sandbox blocked `browser_navigate` and local dev-server access). Validation: - pytest tests -q → 190 passed. - grep -E "_should_force|_APPROVAL_PATTERNS|reject_coding_team_without_repo_binding|_force_finish_due_to_dispatch_limit" packages/agent-core/src packages/prompt-kit/src apps/backend/workflow → 0 hits. Known follow-ups (not in this commit): - CLAUDE.md §"도메인별 첫 분기 의무" still lists vision_team workers as `image_inspector`/`image_editor`; current Vision Team only exposes `vision_analyst`. Doc update is a separate commit. - Playwright UI scenarios (CSV / image / latest news / greeting) need to be run against a live dev stack to fully retire Phase 3. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- apps/backend/tests/test_router_safeguards.py | 37 ++++------ apps/backend/tests/test_routing_prompts.py | 59 ++++++++++++++++ apps/backend/tests/test_supervisor.py | 33 +++++++++ .../src/agent_core/router_schema.py | 9 ++- .../agent-core/src/agent_core/safeguards.py | 31 +------- .../agent_core/supervisors/head_supervisor.py | 15 +--- .../agent_core/supervisors/team_supervisor.py | 50 ------------- packages/prompt-kit/src/prompt_kit/prompts.py | 31 ++++++-- plans/llm-routing-fix.md | 70 +++++++++++++++++++ 9 files changed, 207 insertions(+), 128 deletions(-) create mode 100644 apps/backend/tests/test_routing_prompts.py create mode 100644 plans/llm-routing-fix.md diff --git a/apps/backend/tests/test_router_safeguards.py b/apps/backend/tests/test_router_safeguards.py index d4dd6d7..f0b572a 100644 --- a/apps/backend/tests/test_router_safeguards.py +++ b/apps/backend/tests/test_router_safeguards.py @@ -15,7 +15,6 @@ enforce_dispatch_limit, enforce_team_redirect_limit, fallback_decision_on_parse_failure, - reject_coding_team_without_repo_binding, reject_invalid_goto, ) @@ -113,25 +112,17 @@ def test_router_decision_default_values_are_safe() -> None: assert decision.team_finished is False -def test_coding_team_with_repo_binding_passes_through() -> None: - decision = _make_decision("coding_team") - outcome = reject_coding_team_without_repo_binding(decision, repo_bound=True) - assert outcome.status == "accepted" - assert outcome.decision == decision - - -def test_coding_team_without_repo_binding_forces_finish() -> None: - decision = _make_decision("coding_team") - outcome = reject_coding_team_without_repo_binding(decision, repo_bound=False) - assert outcome.status == "fallback_finish" - assert outcome.decision.next == "FINISH" - assert "safeguard" in outcome.decision.reason - assert "coding_team" in outcome.decision.reason - - -def test_non_coding_team_decision_unaffected_by_repo_binding_safeguard() -> None: - """다른 팀 결정은 
repo_binding 여부와 무관 — safeguard 사이드이펙트 없음.""" - decision = _make_decision("research_team") - outcome = reject_coding_team_without_repo_binding(decision, repo_bound=False) - assert outcome.status == "accepted" - assert outcome.decision == decision +def test_public_safeguard_surface_is_limited_to_policy_functions() -> None: + import agent_core.safeguards as safeguards + + public_functions = { + name + for name in safeguards.__all__ + if name.startswith(("reject_", "enforce_", "fallback_")) + } + assert public_functions == { + "reject_invalid_goto", + "enforce_team_redirect_limit", + "enforce_dispatch_limit", + "fallback_decision_on_parse_failure", + } diff --git a/apps/backend/tests/test_routing_prompts.py b/apps/backend/tests/test_routing_prompts.py new file mode 100644 index 0000000..3ef2ffb --- /dev/null +++ b/apps/backend/tests/test_routing_prompts.py @@ -0,0 +1,59 @@ +from prompt_kit.prompts import ( + CODING_TEAM_SUPERVISOR_PROMPT, + DATA_SCIENCE_TEAM_SUPERVISOR_PROMPT, + RESEARCH_TEAM_SUPERVISOR_PROMPT, + SYSTEM_SUPERVISOR_PROMPT, + TEAM_SUPERVISOR_PROMPT, +) + + +def test_head_prompt_contains_required_first_route_contracts() -> None: + prompt = SYSTEM_SUPERVISOR_PROMPT.template + + required_fragments = [ + "# REQUIRED FIRST ROUTES", + "Data attachment", + "`data_science_team`", + "`data_engineer`", + "`data_analyst`", + "Image attachment", + "`vision_team`", + "`vision_analyst`", + "Current events, news, or \"latest\"", + "`research_team`", + "`search`", + "`web_scraper`", + "Bound repository plus code", + "`coding_team`", + "`codebase_explorer`", + "`implementation_engineer`", + "`runtime_verifier`", + "Explicit report", + "`writing_team`", + "`note_taker`", + "`doc_writer`", + "Simple greetings", + "`FINISH`", + "`content`", + ] + + for fragment in required_fragments: + assert fragment in prompt + + +def test_team_prompt_contains_generic_worker_handoff_contracts() -> None: + prompt = TEAM_SUPERVISOR_PROMPT.template + + assert "# DATA SCIENCE TEAM HANDOFF" 
in prompt + assert "next worker is ALWAYS `data_analyst`" in prompt + assert "# WRITING TEAM HANDOFF" in prompt + assert "Start a new report" in prompt + assert "route to `doc_writer`" in prompt + assert "# VISION TEAM HANDOFF" in prompt + assert "Start image-attachment requests with `vision_analyst`" in prompt + + +def test_dedicated_team_prompts_pin_first_workers() -> None: + assert "Start with `search`" in RESEARCH_TEAM_SUPERVISOR_PROMPT.template + assert "Start with `data_engineer`" in DATA_SCIENCE_TEAM_SUPERVISOR_PROMPT.template + assert "Start with `codebase_explorer`" in CODING_TEAM_SUPERVISOR_PROMPT.template diff --git a/apps/backend/tests/test_supervisor.py b/apps/backend/tests/test_supervisor.py index 9c93eb7..cdbede9 100644 --- a/apps/backend/tests/test_supervisor.py +++ b/apps/backend/tests/test_supervisor.py @@ -18,6 +18,16 @@ async def ainvoke(self, messages): return {"next": self.target_node} +class CountingRouterLLM(FakeRouterLLM): + def __init__(self, target_node: str): + super().__init__(target_node) + self.calls = 0 + + async def ainvoke(self, messages): + self.calls += 1 + return {"next": self.target_node, "reason": "LLM chose next worker"} + + class ApprovalAwareLLM: def with_structured_output(self, schema): return self @@ -67,6 +77,29 @@ async def test_supervisor_routes_to_worker(): assert command.update["route_history"][0]["layer"] == "team" +@pytest.mark.asyncio +async def test_team_dispatch_limit_runs_after_llm_decision(): + """Dispatch limit is a post-decision safeguard, not a pre-LLM branch.""" + fake_llm = CountingRouterLLM("search_agent") + supervisor_func = make_supervisor_node( + fake_llm, # type: ignore + ["search_agent", "web_scraper"], + layer="team", + team_name="ResearchTeam", + max_team_dispatches=0, + ) + + state = cast( + BaseAgentState, + {"messages": [HumanMessage(content="Find me something")], "next": ""}, + ) + command = await supervisor_func(state) + + assert fake_llm.calls == 1 + assert command.goto == "__end__" + assert 
command.update["route_history"][0]["reasoning"].startswith("safeguard:") + + @pytest.mark.asyncio async def test_supervisor_routes_to_finish(): """FINISH at head layer must clear active_team/worker and terminate streaming.""" diff --git a/packages/agent-core/src/agent_core/router_schema.py b/packages/agent-core/src/agent_core/router_schema.py index f96f013..a687a5e 100644 --- a/packages/agent-core/src/agent_core/router_schema.py +++ b/packages/agent-core/src/agent_core/router_schema.py @@ -13,8 +13,7 @@ - ``reason``: short human-readable explanation, exposed to the UI via the ``route`` SSE event (plan §4.0 P4). - ``request_review``: ``True`` if the supervisor wants to interrupt for - HITL approval before continuing. Replaces the rule-based - ``_should_force_approval`` heuristic. + HITL approval before continuing. The LLM sets this from prompt policy. - ``team_finished``: team supervisor asserts the team has nothing more to do this turn; head supervisor uses this to decide between another team dispatch and a finalizer call. @@ -26,8 +25,8 @@ direct-FINISH turns would emit an empty AI message (regression seen after the head/team split in Phase 2.4). -This schema lives in agent_core so that both supervisor.py (today's -rule-based logic) and the upcoming ``LLMRouter`` class can share it. +This schema lives in agent_core so the supervisor factories and the shared +LLM router can use the same structured-output contract. """ from __future__ import annotations @@ -53,7 +52,7 @@ class RouterDecision(BaseModel): default=False, description=( "Set True when the supervisor wants to pause for human approval " - "before continuing (replaces _should_force_approval heuristic)." + "before continuing according to the prompt policy." 
), ) team_finished: bool = Field( diff --git a/packages/agent-core/src/agent_core/safeguards.py b/packages/agent-core/src/agent_core/safeguards.py index 983ece5..8123377 100644 --- a/packages/agent-core/src/agent_core/safeguards.py +++ b/packages/agent-core/src/agent_core/safeguards.py @@ -2,7 +2,7 @@ Phase 2.6 of the codebase-wide refactor. Plan §4.0 P3 says the supervisor should never **override** the LLM's routing decision; it can only **block** -or **re-request** it. These helpers implement exactly that policy. +or **re-request** it. These four helpers implement exactly that policy. All functions are intentionally pure: @@ -137,39 +137,10 @@ def fallback_decision_on_parse_failure( ) -def reject_coding_team_without_repo_binding( - decision: RouterDecision, - *, - repo_bound: bool, -) -> SafeguardOutcome: - """Force FINISH when the LLM picks ``coding_team`` without a bound repo. - - Coding workers require a bound repository to read/write files. When the - router LLM selects ``coding_team`` for a thread that has no repository - binding we cannot proceed — block and force FINISH so the head supervisor - answers directly instead of dispatching a team that will fail. - - This is a pure P3 safeguard: it never changes a *valid* LLM decision; it - only blocks one that violates a hard system precondition. 
- """ - if decision.next != "coding_team" or repo_bound: - return SafeguardOutcome(decision=decision) - return SafeguardOutcome( - decision=RouterDecision( - next="FINISH", - reason="safeguard: coding_team requires a bound repository.", - request_review=False, - team_finished=True, - ), - status="fallback_finish", - ) - - __all__ = [ "SafeguardOutcome", "enforce_dispatch_limit", "enforce_team_redirect_limit", "fallback_decision_on_parse_failure", - "reject_coding_team_without_repo_binding", "reject_invalid_goto", ] diff --git a/packages/agent-core/src/agent_core/supervisors/head_supervisor.py b/packages/agent-core/src/agent_core/supervisors/head_supervisor.py index b65e0ad..4dbecee 100644 --- a/packages/agent-core/src/agent_core/supervisors/head_supervisor.py +++ b/packages/agent-core/src/agent_core/supervisors/head_supervisor.py @@ -9,7 +9,7 @@ when the LLM raises ``request_review`` or the state flag ``force_requires_approval`` is set. - Picks the finalizer over a raw FINISH when synthesis is appropriate. -- Enforces the per-team redirect safeguard via the LLM router. +- Enforces head-layer safeguards via the LLM router. - Writes the routing decision into the persistent ``route_history`` so the SSE layer can emit ``route`` events without re-deriving it. """ @@ -23,7 +23,6 @@ from langgraph.graph import END from langgraph.types import Command -from agent_core.safeguards import reject_coding_team_without_repo_binding from agent_core.state import ( BaseAgentState, ResponseMode, @@ -85,18 +84,6 @@ async def head_supervisor_node(state: BaseAgentState) -> Command: same_team_streak=same_team_streak, ) - # ---- Coding team safeguard: needs repo binding. ------------------- - # Block before HITL so the user is not asked to approve a dispatch - # that the system cannot execute anyway. 
- binding_outcome = reject_coding_team_without_repo_binding( - decision, - repo_bound=bool(shared_context.get("repo_binding")), - ) - if binding_outcome.status != "accepted": - decision = binding_outcome.decision - status = binding_outcome.status - print(f"[HeadSupervisor] {decision.reason}", flush=True) - # ---- HITL: interrupt only when LLM (or state flag) asked for it. -- state_requires_approval = bool( shared_context.get("force_requires_approval", False) diff --git a/packages/agent-core/src/agent_core/supervisors/team_supervisor.py b/packages/agent-core/src/agent_core/supervisors/team_supervisor.py index 6347d91..071f3f0 100644 --- a/packages/agent-core/src/agent_core/supervisors/team_supervisor.py +++ b/packages/agent-core/src/agent_core/supervisors/team_supervisor.py @@ -19,7 +19,6 @@ from typing import Any, Callable from langchain_core.language_models.chat_models import BaseChatModel -from langchain_core.messages import AIMessage from langgraph.graph import END from langgraph.types import Command @@ -72,22 +71,6 @@ async def team_supervisor_node(state: BaseAgentState) -> Command: route_history, normalized_team=normalized_team ) - # Pre-check dispatch ceiling before paying for an LLM call — saves a - # round-trip when the team is already saturated this turn. 
- if ( - normalized_team - and max_team_dispatches is not None - and team_dispatch_count >= max_team_dispatches - ): - print( - f"[TeamSupervisor:{normalized_team}] dispatch limit reached " - f"({team_dispatch_count}/{max_team_dispatches}).", - flush=True, - ) - return _force_finish_due_to_dispatch_limit( - normalized_team=normalized_team, - ) - system_prompt = compose_system_prompt( base_system_prompt, layer="team", @@ -145,39 +128,6 @@ async def team_supervisor_node(state: BaseAgentState) -> Command: return team_supervisor_node -def _force_finish_due_to_dispatch_limit(*, normalized_team: str) -> Command: - """Return a FINISH Command when the dispatch ceiling is hit pre-LLM.""" - return Command( - update={ - "active_team": None, - "active_worker": None, - "route_history": [ - build_route_entry( - layer="team", - node="supervisor", - next_node="FINISH", - team=normalized_team, - reasoning=( - f"{normalized_team} team dispatch limit reached; " - "returning control." - ), - ) - ], - "messages": [ - AIMessage( - content=( - f"[{normalized_team.capitalize()} Team Limit] Dispatch " - "budget reached. Return to the head supervisor and " - "synthesize with the gathered evidence." - ), - name="supervisor", - ) - ], - }, - goto=END, - ) - - def _log_decision(decision: Any, goto: str, status: str) -> None: print(f"[TeamSupervisor] Routing decision: {goto}", flush=True) if decision.reason: diff --git a/packages/prompt-kit/src/prompt_kit/prompts.py b/packages/prompt-kit/src/prompt_kit/prompts.py index 4031bc8..f261bae 100644 --- a/packages/prompt-kit/src/prompt_kit/prompts.py +++ b/packages/prompt-kit/src/prompt_kit/prompts.py @@ -38,12 +38,21 @@ class PromptTemplate(BaseModel): # {ROUTER_DECISION_GUIDANCE} +# REQUIRED FIRST ROUTES +- Data attachment (csv, xlsx, json, pdf, docx) plus analysis, extraction, table, chart, or visualization intent → `data_science_team`. 
That team must start with `data_engineer` for one-pass inspect/preview/profile, then `data_analyst` for verified calculations and chart PNG generation. +- Image attachment → `vision_team`. In the current graph, the Vision Team's implemented first worker is `vision_analyst`; do not invent worker names that are not in the team's member list. +- Current events, news, or "latest" information → `research_team`. The Research Team must start with `search`, then use `web_scraper` only when deeper page evidence is needed. +- Bound repository plus code reading, editing, tests, refactors, debugging, runtime, or repo-local implementation → `coding_team`. The Coding Team must start with `codebase_explorer`, then `implementation_engineer`, then `runtime_verifier` only when execution evidence is needed. +- Explicit report, article, outline, slide, document, or saved writing artifact → `writing_team`. The Writing Team must start with `note_taker`, then `doc_writer`; use `chart_generator` only when the requested writing artifact needs a chart from available evidence. +- Simple greetings, identity questions, conversational pleasantries, and general knowledge that needs no tools → `FINISH` with a complete `content` answer from the head supervisor. + # TEAM SELECTION HINTS - If the latest user turn carries one or more image attachments, prefer `vision_team` (unless the user explicitly asked for repo work, research, etc.). -- **If the latest user turn carries ANY data attachment (pdf, csv, xlsx, docx, json), you MUST route to `data_science_team`** — this team owns analysis, aggregation, chart/PNG generation, and document extraction. The team supervisor will ALWAYS start with `data_engineer` (single-pass inspect/preview/profile brief) before handing off to `data_analyst` for calculations and chart PNG generation. Do NOT route data-attachment turns to `coding_team` (no repo is bound for analysis-only requests) or to `research_team` (the data is already in the file). 
`data_science_team` runs sandboxed Python and saves real chart images. +- **If the latest user turn carries ANY data attachment (pdf, csv, xlsx, docx, json), you MUST route to `data_science_team`** — this team owns analysis, aggregation, chart/PNG generation, and document extraction. Do NOT route data-attachment turns to `coding_team` (no repo is bound for analysis-only requests) or to `research_team` (the data is already in the file). `data_science_team` runs sandboxed Python and saves real chart images. - A request involving an attached spreadsheet/CSV/JSON and the phrase "차트/시각화/그래프/visualization/chart/plot/PNG/이미지" is ALWAYS a `data_science_team` task. `request_review` must stay `false` for these — the python_repl_data_tool sandbox is safe and needs no human approval. - If a repository is bound to the current thread AND the user is asking for code reads, edits, tests, refactors, or any repo-local implementation work, prefer `coding_team`. With no bound repo, do NOT route to `coding_team` — answer directly or via the finalizer instead. - For questions about current events, news, or "latest" topics, prefer `research_team` and do not rely on internal knowledge. +- For explicit long-form writing deliverables, prefer `writing_team`; do not use it for ordinary final-answer synthesis. # CRITICAL GUIDELINES 1. Write concise routing reasoning in the 'reason' field. If a CURRENT TASK PLAN is provided, refer to the current stage, but do not expand a simple task into unnecessary micro-steps. @@ -62,7 +71,7 @@ class PromptTemplate(BaseModel): 10a. Outputting code as text — explanations, snippets, examples, walkthroughs of LangChain/LangGraph/MCP/etc. — is NOT 'executing code'. When the user only asks to *see* or *describe* code, set `request_review` to false even if coding_team handles the response. 11. AVOID re-dispatching the same team after it already returned control once in this turn. 
If the latest `[Review Failed]`/`[Review Warning]`/team feedback in the conversation came from a team you already routed to, prefer FINISH so the finalizer synthesizes from what is already gathered. Only re-route to the same team when there is a concrete actionable gap that team alone can fix. """, - version="2.6", + version="2.7", ) TEAM_SUPERVISOR_PROMPT = PromptTemplate( @@ -85,6 +94,16 @@ class PromptTemplate(BaseModel): - If the analyst's PNG/chart attempt failed once with a code error, dispatch `data_analyst` ONE more time with the Reviewer feedback so the analyst can fix the code. After two failed analyst attempts in a row, FINISH and let the head supervisor synthesize from what was gathered. - If the user explicitly asked for a chart/PNG and the analyst has not yet been dispatched, route to `data_analyst` immediately even if the engineer brief is incomplete — the analyst can fill the gap. +# WRITING TEAM HANDOFF (when members include `note_taker` and `doc_writer`) +- Start a new report, article, outline, slide, or saved document request with `note_taker` so the evidence and structure are organized first. +- After `note_taker` has produced an outline or structured notes, route to `doc_writer` for the polished artifact. +- Use `chart_generator` only when the writing deliverable needs a chart generated from already-available evidence or data. +- Do not call `doc_writer` first for a new writing artifact unless the conversation already contains a complete outline. + +# VISION TEAM HANDOFF (when members include `vision_analyst`) +- Start image-attachment requests with `vision_analyst`. +- The current Vision Team exposes `vision_analyst` as the image-inspection worker. Do not choose `image_inspector` or `image_editor` unless those exact names are present in the provided member list. + # CRITICAL GUIDELINES 1. Write concise routing reasoning in the 'reason' field. Explicitly state what remains, but keep the worker sequence minimal. 2. 
If you receive a [Validation Failed] message from a validator, read the feedback and route the task BACK to the appropriate worker for self-correction. @@ -92,7 +111,7 @@ class PromptTemplate(BaseModel): 4. AVOID loops: If a worker has already attempted a task and failed multiple times, do not keep sending it back without a clear reason. If you cannot improve the output further, return FINISH and let the head supervisor decide. 5. Prefer FINISH when the team objective is materially complete. Minor stylistic improvements alone do not justify another worker handoff. """, - version="1.4", + version="1.5", ) RESEARCH_TEAM_SUPERVISOR_PROMPT = PromptTemplate( @@ -549,7 +568,7 @@ class PromptTemplate(BaseModel): 3. Keep the title to one line. 4. Keep it concise and list-friendly. 5. Maximum length: 24 characters. -6. Preserve important technical keywords like RoPE, ALiBi, JWT, OAuth, SQL when helpful. +6. Preserve important technical terms like RoPE, ALiBi, JWT, OAuth, SQL when helpful. 7. Remove polite phrasing, question endings, and unnecessary detail. 8. Do not include quotes, markdown, bullets, colons, or trailing punctuation. @@ -582,7 +601,7 @@ class PromptTemplate(BaseModel): # OUTPUT RULES 1. Return only the structured output fields. -2. Prefer Korean unless a technical keyword is better preserved in English. +2. Prefer Korean unless a technical term is better preserved in English. 3. Each suggestion must be one line. 4. Keep each suggestion short and sidebar-friendly. 5. Maximum length per suggestion: 36 characters. @@ -593,7 +612,7 @@ class PromptTemplate(BaseModel): # QUALITY BAR - The suggestions should feel actionable. -- Preserve important technical keywords when useful. +- Preserve important technical terms when useful. - Prefer concrete continuations over generic prompts. 
# EXAMPLE diff --git a/plans/llm-routing-fix.md b/plans/llm-routing-fix.md new file mode 100644 index 0000000..be7253c --- /dev/null +++ b/plans/llm-routing-fix.md @@ -0,0 +1,70 @@ +--- +작업명: LLM Routing Rule-Based Residue Cleanup +간단요약: rule-based routing 잔재를 제거하고 프롬프트 기반 첫 분기 정책과 UI/SSE 라우팅 검증을 보강한다. +작성일시: 2026-05-22 16:56 KST +최종 수정일시: 2026-05-22 17:04 KST +--- + +# LLM Routing Rule-Based Residue Cleanup + +## 목표 + +OrchAgent의 head/team supervisor 라우팅이 정규식, 키워드, 강제 분기 없이 `RouterDecision` 기반 LLM 결정과 허용된 safeguard 4종만으로 동작하도록 정리한다. + +## 범위 + +- `packages/agent-core/src/agent_core/` +- `apps/backend/workflow/` +- `packages/prompt-kit/src/prompt_kit/prompts.py` +- 관련 백엔드 테스트와 Playwright UI 시나리오 + +## 감사 결과 + +- [x] `reject_coding_team_without_repo_binding`은 5번째 safeguard라서 제거 대상이다. +- [x] `head_supervisor.py`가 repo binding 유무로 `coding_team` 결정을 `FINISH`로 강제 변경한다. +- [x] `team_supervisor.py`가 dispatch limit 도달 시 LLM 호출 전 직접 `FINISH`로 강제 종료한다. +- [x] `router_schema.py`에 `_should_force_approval` 문서 잔재가 남아 있다. +- [x] 실제 `vision_team` worker는 `vision_analyst` 하나이며, CLAUDE.md 표의 `image_inspector`/`image_editor`와 이름이 다르다. + +## Phase 1. Rule-Based Residue Removal + +- [x] 5번째 repo-binding safeguard와 호출부 제거 +- [x] team supervisor의 pre-LLM dispatch-limit shortcut 제거 +- [x] router schema의 `_should_force_approval` 잔재 제거 +- [x] 관련 safeguard 테스트 정리 + +## Phase 2. Prompt And Test Coverage + +- [x] head/team/research/data/coding/writing/vision 첫 분기 프롬프트 보완 +- [x] 실제 worker 이름과 다른 image worker 정책을 안전하게 정리 +- [x] prompt/safeguard/router 테스트 추가 또는 갱신 + +## Phase 3. Verification + +- [x] grep으로 rule-based routing 패턴 확인 +- [x] 관련 pytest 통과 확인 +- [ ] Playwright MCP로 CSV 첨부, 이미지 첨부, 최신 뉴스, 인사 시나리오 확인 +- [ ] SSE `route` 이벤트 reason과 Inner Monologue 패널 확인 + +차단 사유: 2026-05-22 17:02 KST 기준 Playwright MCP `browser_navigate` 호출이 `user rejected MCP tool call`로 거절되었고, sandbox에서 `localhost:3000`/`localhost:8002` 접근도 `Operation not permitted`로 실패했다. 대체 검증으로 프론트 SSE/Inner Monologue 테스트를 실행했다. 
+ +## 검증 기록 + +- `rg -n "_should_force|_APPROVAL_PATTERNS|re\\.(match|search)|keyword|keywords|reject_coding_team_without_repo_binding|_force_finish_due_to_dispatch_limit" packages/agent-core/src/agent_core apps/backend/workflow packages/prompt-kit/src/prompt_kit` → 0건 +- `PYTHONPATH=apps/backend MPLCONFIGDIR=/private/tmp/mpl-cache UV_CACHE_DIR=/private/tmp/uv-cache uv run pytest apps/backend/tests/test_router_safeguards.py apps/backend/tests/test_supervisor.py apps/backend/tests/test_routing_prompts.py apps/backend/tests/test_llm_router.py -q` → 31 passed +- `PYTHONPATH=apps/backend MPLCONFIGDIR=/private/tmp/mpl-cache UV_CACHE_DIR=/private/tmp/uv-cache uv run pytest apps/backend/tests/test_team_subgraphs.py apps/backend/tests/test_planner.py apps/backend/tests/routing_eval/test_scorer.py -q` → 15 passed +- `PYTHONPATH=apps/backend MPLCONFIGDIR=/private/tmp/mpl-cache UV_CACHE_DIR=/private/tmp/uv-cache uv run pytest apps/backend/tests -q` → 190 passed +- `npm run test -- src/lib/sse-reducer.test.ts src/app/page.test.tsx` → 24 passed + +## 커밋 상태 + +- 2026-05-22 17:04 KST 기준 `git add ...` 실행 시 `.git/index.lock` 생성이 `Operation not permitted`로 차단되었다. +- `test -w .git` / `test -w .git/index`도 현재 sandbox에서 writable이 아니라고 반환한다. +- 따라서 변경은 검증 완료 상태지만, 이 세션에서는 커밋 생성이 차단되었다. + +## 검증 방법 + +- `rg -n --glob '*.py' "_should_force|_APPROVAL_PATTERNS|re\\.(match|search)|keyword|keywords" packages/agent-core/src/agent_core apps/backend/workflow packages/prompt-kit/src/prompt_kit` +- `cd apps/backend && uv run pytest tests/test_router_safeguards.py tests/test_llm_router.py tests/test_supervisor.py tests/test_team_subgraphs.py tests/test_planner.py -q` +- 가능한 경우 전체 백엔드 테스트 또는 routing eval 실행 +- 가능한 경우 `./infra/scripts/start-dev.sh` 후 Playwright MCP UI 검증