From fa132138098fc66cfa812998de7981cea981b3bb Mon Sep 17 00:00:00 2001
From: Protocol Zero <257158451+Protocol-zero-0@users.noreply.github.com>
Date: Fri, 12 Jun 2026 23:39:15 +0000
Subject: [PATCH] =?UTF-8?q?Agenda=20=E6=96=B9=E5=90=91=E7=A1=AC=E7=BA=A6?=
 =?UTF-8?q?=E6=9D=9F=EF=BC=9A=E7=94=9F=E6=88=90=20prompt=20=E6=B3=A8?=
 =?UTF-8?q?=E5=85=A5=E6=96=B9=E5=90=91=E5=8E=9F=E8=AF=9D=20+=20=E5=87=BA?=
 =?UTF-8?q?=E5=9F=9F=E7=BB=93=E6=9E=9C=E7=A1=AE=E5=AE=9A=E6=80=A7=E6=8B=A6?=
 =?UTF-8?q?=E6=88=AA?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

#41 只圈了信号消费侧，Tier1/Tier2 的生成 prompt 完全不知道 agenda 存在，
taxonomy 匹配不上还会回退全库扫描，实测一个粗差检测方向的 agenda 产出的
5 条 Tier2 idea 全部跑题（RAG/代码生成等）。

- 两个 agent 的全部生成环节（结构检测、形式化、问题提炼、方法设计、实验
  设计）在带 agenda 时把方向原话 + focus/prefer 关键词作为硬约束段注入
  prompt；不带 agenda 时 prompt 逐字节不变
- 新增 agents/agenda_relevance.py：生成后的规则关键词闸，命中任一范围词
  即保留（阈值 AGENDA_SCOPE_MIN_TERM_HITS 可配，默认 1，0 关闭），出域
  insight 不入库，计入 dropped_out_of_scope 并记日志
- discover_paradigm_insights / discover_paper_ideas 返回值改为
  {insights, dropped_out_of_scope}，调度器解包并写入 tier*_done 事件
---
 agents/agenda_relevance.py             | 160 ++++++++++
 agents/paper_idea_agent.py             |  75 +++--
 agents/paradigm_agent.py               |  67 +++--
 config.py                              |   8 +
 orchestrator/discovery_scheduler.py    |  42 ++-
 scripts/run_deepgraph_new_idea_once.py |   3 +-
 tests/test_agenda_prompt_scoping.py    | 400 +++++++++++++++++++++++++
 tests/test_vnext_discovery.py          |   5 +-
 8 files changed, 709 insertions(+), 51 deletions(-)
 create mode 100644 agents/agenda_relevance.py
 create mode 100644 tests/test_agenda_prompt_scoping.py

diff --git a/agents/agenda_relevance.py b/agents/agenda_relevance.py
new file mode 100644
index 0000000..e302627
--- /dev/null
+++ b/agents/agenda_relevance.py
@@ -0,0 +1,160 @@
+"""Agenda direction guardrails: prompt constraint block + deterministic scope gate.
+
+PR #41 scoped what the Tier 1 / Tier 2 agents *read* (signal queries circled to
+the agenda's taxonomy subgraph), but nothing constrained what they *write*: the
+generation prompts never mentioned the agenda, so off-topic candidates passed
+through whenever the taxonomy match was loose or fell back to the global scan.
+
+This module adds the missing two pieces, both rule-based (no extra LLM calls):
+
+1. ``agenda_constraint_block(agenda)`` — a prompt section appended to every
+   generation prompt when an agenda is present, stating the user's direction
+   verbatim plus the scope keywords, and instructing the model to stay inside.
+2. ``insight_in_scope(insight, agenda)`` — a post-generation keyword gate.
+   Generated insights whose text matches none of the agenda's scope terms are
+   dropped before storage and reported as ``dropped_out_of_scope``.
+
+The gate is intentionally lenient: by default one term hit is enough to keep
+an insight (configurable via AGENDA_SCOPE_MIN_TERM_HITS). Its job is to catch
+clearly unrelated output, not to rank borderline cases — prompt steering does
+the fine-grained work, the gate is the deterministic backstop.
+"""
+
+from __future__ import annotations
+
+import re
+from typing import Any, Iterable
+
+from config import AGENDA_SCOPE_MIN_TERM_HITS
+
+# Tokens that appear in almost any ML research direction. Useless as scope
+# evidence when auto-extracted from free text, so they are skipped during
+# tokenization. Explicit keyword phrases (focus / prefer.keywords) are always
+# kept verbatim regardless of this list.
+_GENERIC_TOKENS = {
+    "and", "are", "based", "between", "data", "deep", "for", "from", "into",
+    "learning", "machine", "method", "methods", "model", "models", "new",
+    "novel", "over", "paper", "papers", "research", "task", "tasks", "that",
+    "the", "this", "toward", "towards", "under", "use", "using", "via",
+    "with",
+}
+
+_TOKEN_RE = re.compile(r"[a-z][a-z0-9\-]{2,}")  # 3+ chars: one lowercase letter, then letters/digits/hyphens
+
+# Fields whose text represents what a generated insight is about. Covers both
+# tiers: Tier 2 ideas carry title/problem_statement/proposed_method, Tier 1
+# paradigm insights carry title/formal_structure/transformation.
+_SCOPE_TEXT_FIELDS = (
+    "title",
+    "problem_statement",
+    "proposed_method",
+    "formal_structure",
+    "transformation",
+)
+
+
+def _tokens(text: Any) -> list[str]:
+    """Return ``_TOKEN_RE`` matches of the lowercased text, minus generic tokens."""
+    lowered = str(text or "").lower()
+    # Falsy input (None, "", 0) becomes "" and therefore yields no tokens.
+    candidates = _TOKEN_RE.findall(lowered)
+    return [word for word in candidates if word not in _GENERIC_TOKENS]
+
+
+def agenda_match_terms(agenda) -> list[str]:
+    """Build the lowercased, de-duplicated term list the relevance gate matches on.
+
+    Terms keep first-seen order and come from three sources:
+    - each focus / prefer.keywords phrase as the user wrote it (stripped and
+      lowercased), immediately followed by
+    - that phrase's individual tokens (so "outlier rejection" also matches
+      text that only says "outlier");
+    - finally the tokens of the free-text direction description, which picks
+      up terms the user never repeated in the keyword lists.
+    """
+    from agents.agenda_selector import agenda_scope_keywords
+
+    # A dict doubles as an insertion-ordered set: its keys are the terms.
+    ordered: dict[str, None] = {}
+
+    def _remember(raw: str) -> None:
+        cleaned = raw.strip().lower()
+        if cleaned:
+            ordered.setdefault(cleaned, None)
+
+    for keyword in agenda_scope_keywords(agenda):
+        _remember(keyword)
+        for piece in _tokens(keyword):
+            _remember(piece)
+    for piece in _tokens(getattr(agenda, "description", "")):
+        _remember(piece)
+    return list(ordered)
+
+
+def agenda_constraint_block(agenda) -> str:
+    """Render the hard-constraint prompt section for an agenda.
+
+    The section names the direction (the agenda's description, or its name
+    when the description is blank), lists the scope keywords if there are
+    any, and ends with three stay-on-topic rules. The returned text begins
+    with a newline so it is set off from whatever precedes it in a prompt.
+    """
+    from agents.agenda_selector import agenda_scope_keywords
+
+    description = str(getattr(agenda, "description", "") or "").strip()
+    # The name is only consulted when the description is empty after stripping.
+    direction = description or str(getattr(agenda, "name", "") or "").strip()
+    scope_terms = agenda_scope_keywords(agenda)
+    # Zero or one line: the keyword line is omitted when there are no keywords.
+    keyword_lines = [f"Scope keywords: {', '.join(scope_terms)}"] if scope_terms else []
+    return "\n".join(
+        [
+            "",
+            "# RESEARCH DIRECTION CONSTRAINT (hard requirement)",
+            "",
+            "All output must stay inside this user-defined research direction:",
+            "",
+            f"Direction: {direction}",
+            *keyword_lines,
+            "",
+            "Rules:",
+            "- Only propose problems, insights, and methods that fall inside this direction.",
+            "- Ignore signals and evidence unrelated to the direction, even if they look promising.",
+            "- If little of the evidence fits the direction, return fewer items rather than drifting off-topic.",
+        ]
+    )
+
+
+def insight_scope_text(insight: dict) -> str:
+    """Join the insight's non-empty scope fields into one lowercased string.
+
+    Fields are read in ``_SCOPE_TEXT_FIELDS`` order; missing or falsy values
+    are skipped and every kept value is passed through ``str()``.
+    """
+    present = filter(None, map(insight.get, _SCOPE_TEXT_FIELDS))
+    return " ".join(map(str, present)).lower()
+
+
+def count_term_hits(text: str, terms: Iterable[str]) -> int:
+    return len([needle for needle in terms if needle in text])  # substring containment, one hit per term
+
+
+def insight_in_scope(insight: dict, agenda, *, min_hits: int | None = None) -> bool:
+    """Decide whether a generated insight mentions enough agenda scope terms.
+
+    ``min_hits`` defaults to AGENDA_SCOPE_MIN_TERM_HITS. The gate is bypassed
+    (returns True) when there is no agenda, when the threshold is zero or
+    negative, or when no scope term contains an ASCII letter or digit — e.g.
+    a direction written entirely in Chinese cannot be matched against English
+    insight text, and dropping everything would be worse than dropping nothing.
+    """
+    threshold = AGENDA_SCOPE_MIN_TERM_HITS if min_hits is None else min_hits
+    if agenda is None or threshold <= 0:
+        return True
+    terms = agenda_match_terms(agenda)
+    matchable = [term for term in terms if re.search(r"[a-z0-9]", term)]
+    if not matchable:
+        return True
+    # Hits are counted over all terms, not only the ASCII-matchable ones.
+    haystack = insight_scope_text(insight)
+    return count_term_hits(haystack, terms) >= threshold
diff --git a/agents/paper_idea_agent.py b/agents/paper_idea_agent.py
index aa65724..b4b11b4 100644
--- a/agents/paper_idea_agent.py
+++ b/agents/paper_idea_agent.py
@@ -10,6 +10,7 @@
 """
 import json
 from agents.agenda_budget import AgendaBudgetExceededError
+from agents.agenda_relevance import agenda_constraint_block, insight_in_scope
 from agents.discovery_metadata import build_evidence_packet, enrich_deep_insight
 from agents.insight_validation import get_evosci_input_issue
 from agents.llm_client import call_llm_json, is_llm_auth_error, is_llm_provider_unavailable_error
@@ -243,8 +244,12 @@
 }"""
 
 
-def _build_problem_prompt(signals: dict) -> str:
-    """Build evidence prompt for Call 1 (Problem Sharpening)."""
+def _build_problem_prompt(signals: dict, agenda=None) -> str:
+    """Build evidence prompt for Call 1 (Problem Sharpening).
+
+    With an agenda, the user's research direction is appended as a hard
+    constraint; without one the prompt is built exactly as before.
+    """
     sections = ["# EVIDENCE FROM 10,000+ ML PAPERS\n"]
 
     # Contradiction clusters
@@ -337,12 +342,15 @@ def _build_problem_prompt(signals: dict) -> str:
         for row in rows[:6]:
             sections.append(f"- {json.dumps(row, ensure_ascii=True, default=str)[:260]}")
 
+    if agenda is not None:
+        sections.append(agenda_constraint_block(agenda))
+
     return "\n".join(sections)
 
 
-def _build_method_prompt(problem: dict) -> str:
+def _build_method_prompt(problem: dict, agenda=None) -> str:
     """Build prompt for Call 2 (Method Invention)."""
-    return f"""# RESEARCH PROBLEM
+    prompt = f"""# RESEARCH PROBLEM
 
 ## Title: {problem['title']}
 
@@ -365,11 +373,14 @@ def _build_method_prompt(problem: dict) -> str:
 
 Design a NEW method that addresses this specific failure mode.
 The method must be technically novel — not "apply [existing technique] to [this domain]"."""
+    if agenda is not None:
+        prompt += "\n" + agenda_constraint_block(agenda)
+    return prompt
 
 
-def _build_experiment_prompt(problem: dict, method: dict) -> str:
+def _build_experiment_prompt(problem: dict, method: dict, agenda=None) -> str:
     """Build prompt for Call 3 (Experimental Design)."""
-    return f"""# PROPOSED RESEARCH
+    prompt = f"""# PROPOSED RESEARCH
 
 ## Problem
 Title: {problem['title']}
@@ -387,6 +398,9 @@ def _build_experiment_prompt(problem: dict, method: dict) -> str:
 
 Design a complete experimental plan for validating this method.
 Be specific: exact model names, dataset names, metric names, compute estimates."""
+    if agenda is not None:
+        prompt += "\n" + agenda_constraint_block(agenda)
+    return prompt
 
 
 def _llm_temporarily_unavailable(exc: Exception) -> bool:
@@ -400,16 +414,19 @@ def discover_paper_ideas(
     tier2_plateau_limit: int = 20,
     tier2_limitation_nodes: int = 15,
     agenda=None,
-) -> list[dict]:
+) -> dict:
     """Run the 3-stage paper idea discovery pipeline.
 
-    Returns list of deep_insight dicts ready for storage.
+    Returns {"insights": [...], "dropped_out_of_scope": n} where insights are
+    deep_insight dicts ready for storage.
     If max_papers is None, every sharpened problem (up to max_problems) is expanded.
 
     With an agenda (contracts.agenda.ResearchAgenda), the signal scan is
-    circled to the matching taxonomy subgraph and produced ideas are tagged
-    with agenda_id. Budget exhaustion stops the loop cleanly, returning the
-    ideas accepted so far.
+    circled to the matching taxonomy subgraph, every generation prompt carries
+    the agenda's direction as a hard constraint, ideas whose text matches none
+    of the agenda's scope terms are dropped (counted in dropped_out_of_scope),
+    and produced ideas are tagged with agenda_id. Budget exhaustion stops the
+    loop cleanly, returning the ideas accepted so far.
     """
     if max_papers is None:
         max_papers = max_problems
@@ -417,6 +434,11 @@ def discover_paper_ideas(
     print(f"[PAPER_IDEA] Starting Tier 2 discovery...", flush=True)
     total_tokens = 0
     total_calls = 0
+    deep_insights: list[dict] = []
+    dropped_out_of_scope = 0
+
+    def _result() -> dict:
+        return {"insights": deep_insights, "dropped_out_of_scope": dropped_out_of_scope}
 
     # Stage 0: Gather signals (scoped to the agenda's subgraph when known)
     scope_node_ids = None
@@ -451,29 +473,29 @@ def discover_paper_ideas(
     )
     if not has_signals:
         print("[PAPER_IDEA] No signals available. Run signal_harvester first.", flush=True)
-        return []
+        return _result()
 
     # Stage 1: Problem Sharpening
     print("[PAPER_IDEA] Call 1/3: Problem Sharpening...", flush=True)
-    problem_prompt = _build_problem_prompt(signals)
+    problem_prompt = _build_problem_prompt(signals, agenda=agenda)
     try:
         result1, tokens1 = call_llm_json(PROBLEM_SHARPENING_SYSTEM, problem_prompt)
         total_tokens += tokens1
         total_calls += 1
     except AgendaBudgetExceededError as e:
         print(f"[PAPER_IDEA] Stopped before problem sharpening: {e}", flush=True)
-        return []
+        return _result()
     except Exception as e:
         if _llm_temporarily_unavailable(e):
             print(f"[PAPER_IDEA] Problem sharpening skipped: LLM unavailable ({e})", flush=True)
-            return []
+            return _result()
         print(f"[PAPER_IDEA] Problem sharpening failed: {e}", flush=True)
-        return []
+        return _result()
 
     problems = result1.get("problems", [])
     if not problems:
         print("[PAPER_IDEA] No problems extracted", flush=True)
-        return []
+        return _result()
 
     problem_budget = min(len(problems), max_problems + max(2, max_papers // 2))
     problems = problems[:problem_budget]
@@ -483,7 +505,6 @@ def discover_paper_ideas(
     )
 
     # Stage 2 + 3: Method Invention + Experiment Design for top problems
-    deep_insights = []
     for i, problem in enumerate(problems):
         if len(deep_insights) >= max_papers:
             break
@@ -493,7 +514,7 @@ def discover_paper_ideas(
 
         # Stage 2: Method Invention
         print(f"[PAPER_IDEA] Call 2/3: Inventing method for '{title[:50]}'...", flush=True)
-        method_prompt = _build_method_prompt(problem)
+        method_prompt = _build_method_prompt(problem, agenda=agenda)
         try:
             result2, tokens2 = call_llm_json(METHOD_INVENTION_SYSTEM, method_prompt)
             total_tokens += tokens2
@@ -521,7 +542,7 @@ def discover_paper_ideas(
 
         # Stage 3: Experimental Design
         print(f"[PAPER_IDEA] Call 3/3: Designing experiments for '{method['name']}'...", flush=True)
-        exp_prompt = _build_experiment_prompt(problem, method)
+        exp_prompt = _build_experiment_prompt(problem, method, agenda=agenda)
         try:
             result3, tokens3 = call_llm_json(EXPERIMENT_DESIGN_SYSTEM, exp_prompt)
             total_tokens += tokens3
@@ -604,6 +625,15 @@ def discover_paper_ideas(
             "agenda_id": agenda.agenda_id if agenda is not None else None,
         }
 
+        if agenda is not None and not insight_in_scope(deep_insight, agenda):
+            dropped_out_of_scope += 1
+            print(
+                f"[PAPER_IDEA] Dropped out-of-scope idea for agenda "
+                f"'{agenda.name}': {deep_insight['title'][:80]}",
+                flush=True,
+            )
+            continue
+
         input_issue = get_evosci_input_issue(deep_insight, mode="verification")
         if input_issue:
             missing = ", ".join(input_issue.get("missing_fields") or [])
@@ -616,6 +646,7 @@ def discover_paper_ideas(
         deep_insights.append(enrich_deep_insight(deep_insight))
         print(f"[PAPER_IDEA] Accepted: {method['name']} — {title[:60]}", flush=True)
 
-    print(f"[PAPER_IDEA] Done: {len(deep_insights)} paper ideas from {len(problems)} problems. "
+    print(f"[PAPER_IDEA] Done: {len(deep_insights)} paper ideas from {len(problems)} problems "
+          f"({dropped_out_of_scope} dropped as out of agenda scope). "
           f"Tokens: {total_tokens}, LLM calls: {total_calls}", flush=True)
-    return deep_insights
+    return _result()
diff --git a/agents/paradigm_agent.py b/agents/paradigm_agent.py
index 1f785c1..2c722a2 100644
--- a/agents/paradigm_agent.py
+++ b/agents/paradigm_agent.py
@@ -21,6 +21,7 @@
     is_llm_provider_unavailable_error,
 )
 from agents.agenda_budget import AgendaBudgetExceededError
+from agents.agenda_relevance import agenda_constraint_block, insight_in_scope
 from contracts import DeepInsightSpec, normalize_deep_insight_storage
 from agents.signal_harvester import agenda_taxonomy_node_ids, get_tier1_signals
 from config import LLM_MODEL, PROMPT_VERSION
@@ -201,8 +202,12 @@ def _guess_mechanism_type(candidate: dict, formalized: dict) -> str:
     return "structural_equivalence"
 
 
-def _build_structure_prompt(signals: dict) -> str:
-    """Build the evidence prompt for Call 1 (Structure Detection)."""
+def _build_structure_prompt(signals: dict, agenda=None) -> str:
+    """Build the evidence prompt for Call 1 (Structure Detection).
+
+    With an agenda, the user's research direction is appended as a hard
+    constraint; without one the prompt is built exactly as before.
+    """
     sections = []
 
     sections.append("# CROSS-FIELD EVIDENCE FROM 10,000+ ML PAPERS\n")
@@ -267,10 +272,13 @@ def _build_structure_prompt(signals: dict) -> str:
             # default=str keeps prompt building backend-agnostic.
             sections.append(f"- {json.dumps(row, ensure_ascii=True, default=str)[:220]}")
 
+    if agenda is not None:
+        sections.append(agenda_constraint_block(agenda))
+
     return "\n".join(sections)
 
 
-def _build_formalization_prompt(candidate: dict, signals: dict) -> str:
+def _build_formalization_prompt(candidate: dict, signals: dict, agenda=None) -> str:
     """Build the evidence prompt for Call 2 (Formalization)."""
     sections = [f"# CANDIDATE PARADIGM INSIGHT\n"]
     sections.append(f"## Title: {candidate['title']}\n")
@@ -320,6 +328,9 @@ def _build_formalization_prompt(candidate: dict, signals: dict) -> str:
             for r in results:
                 sections.append(f"  {r['method_name']} on {r['dataset_name']} [{r['metric_name']}] = {r['metric_value']}")
 
+    if agenda is not None:
+        sections.append(agenda_constraint_block(agenda))
+
     return "\n".join(sections)
 
 
@@ -382,20 +393,29 @@ def discover_paradigm_insights(
     tier1_top_overlaps: int = 20,
     tier1_top_patterns: int = 15,
     agenda=None,
-) -> list[dict]:
+) -> dict:
     """Run the 3-stage paradigm discovery pipeline.
 
-    Returns list of deep_insight dicts ready for storage.
+    Returns {"insights": [...], "dropped_out_of_scope": n} where insights are
+    deep_insight dicts ready for storage.
 
     With an agenda (contracts.agenda.ResearchAgenda), the signal scan is
-    circled to the taxonomy subgraph matching the agenda's scope keywords and
-    every produced insight is tagged with agenda_id. Budget exhaustion
-    (AgendaBudgetExceededError from the metered LLM client) stops the loop
-    cleanly, returning the insights accepted so far.
+    circled to the taxonomy subgraph matching the agenda's scope keywords,
+    every generation prompt carries the agenda's direction as a hard
+    constraint, insights whose text matches none of the agenda's scope terms
+    are dropped (counted in dropped_out_of_scope), and every produced insight
+    is tagged with agenda_id. Budget exhaustion (AgendaBudgetExceededError
+    from the metered LLM client) stops the loop cleanly, returning the
+    insights accepted so far.
     """
     print(f"[PARADIGM] Starting Tier 1 discovery (max {max_candidates} candidates)...", flush=True)
     total_tokens = 0
     total_calls = 0
+    deep_insights: list[dict] = []
+    dropped_out_of_scope = 0
+
+    def _result() -> dict:
+        return {"insights": deep_insights, "dropped_out_of_scope": dropped_out_of_scope}
 
     # Stage 0: Gather signals (scoped to the agenda's subgraph when known)
     scope_node_ids = None
@@ -416,29 +436,29 @@ def discover_paradigm_insights(
     )
     if not signals["entity_overlaps"] and not signals["pattern_matches"]:
         print("[PARADIGM] No signals available. Run signal_harvester first.", flush=True)
-        return []
+        return _result()
 
     # Stage 1: Structure Detection
     print("[PARADIGM] Call 1/3: Structure Detection...", flush=True)
-    structure_prompt = _build_structure_prompt(signals)
+    structure_prompt = _build_structure_prompt(signals, agenda=agenda)
     try:
         result1, tokens1 = call_llm_json(STRUCTURE_DETECTION_SYSTEM, structure_prompt)
         total_tokens += tokens1
         total_calls += 1
     except AgendaBudgetExceededError as e:
         print(f"[PARADIGM] Stopped before structure detection: {e}", flush=True)
-        return []
+        return _result()
     except Exception as e:
         if _llm_temporarily_unavailable(e):
             print(f"[PARADIGM] Structure detection skipped: LLM unavailable ({e})", flush=True)
-            return []
+            return _result()
         print(f"[PARADIGM] Structure detection failed: {e}", flush=True)
-        return []
+        return _result()
 
     candidates = result1.get("candidates", [])
     if not candidates:
         print("[PARADIGM] No candidates from structure detection", flush=True)
-        return []
+        return _result()
 
     candidates.sort(key=lambda c: c.get("confidence", 0), reverse=True)
     candidate_budget = min(len(candidates), max_candidates + max(2, max_candidates // 2))
@@ -449,7 +469,6 @@ def discover_paradigm_insights(
     )
 
     # Stage 2 + 3: Formalize and Challenge each candidate
-    deep_insights = []
     for i, candidate in enumerate(candidates):
         if len(deep_insights) >= max_candidates:
             break
@@ -458,7 +477,7 @@ def discover_paradigm_insights(
 
         # Stage 2: Formalization
         print(f"[PARADIGM] Call 2/3: Formalizing '{title[:50]}'...", flush=True)
-        formal_prompt = _build_formalization_prompt(candidate, signals)
+        formal_prompt = _build_formalization_prompt(candidate, signals, agenda=agenda)
         try:
             result2, tokens2 = call_llm_json(FORMALIZATION_SYSTEM, formal_prompt)
             total_tokens += tokens2
@@ -566,6 +585,15 @@ def discover_paradigm_insights(
         if result2.get("minimal_experiment"):
             deep_insight["experimental_plan"] = json.dumps(result2["minimal_experiment"])
 
+        if agenda is not None and not insight_in_scope(deep_insight, agenda):
+            dropped_out_of_scope += 1
+            print(
+                f"[PARADIGM] Dropped out-of-scope insight for agenda "
+                f"'{agenda.name}': {deep_insight['title'][:80]}",
+                flush=True,
+            )
+            continue
+
         input_issue = get_evosci_input_issue(deep_insight, mode="verification")
         if input_issue:
             missing = ", ".join(input_issue.get("missing_fields") or [])
@@ -578,9 +606,10 @@ def discover_paradigm_insights(
         deep_insights.append(enrich_deep_insight(deep_insight))
         print(f"[PARADIGM] Accepted: {title[:80]} (score={score})", flush=True)
 
-    print(f"[PARADIGM] Done: {len(deep_insights)} insights from {len(candidates)} candidates. "
+    print(f"[PARADIGM] Done: {len(deep_insights)} insights from {len(candidates)} candidates "
+          f"({dropped_out_of_scope} dropped as out of agenda scope). "
           f"Tokens: {total_tokens}, LLM calls: {total_calls}", flush=True)
-    return deep_insights
+    return _result()
 
 
 def _jsonify(v):
diff --git a/config.py b/config.py
index a34ce62..dd077c2 100644
--- a/config.py
+++ b/config.py
@@ -253,6 +253,14 @@ def _split_csv(value: str | list | tuple | None) -> list[str]:
     "DEEPGRAPH_AGENDA_TOKEN_BUDGET_DEFAULT", 0, "agenda.token_budget_default"
 )
 
+# Agenda relevance gate: minimum number of agenda scope terms (focus +
+# prefer.keywords + direction-text tokens) a generated insight must mention
+# to be stored under that agenda. Deliberately lenient: 1 keeps an insight on
+# any single term hit; 0 disables the gate. See agents/agenda_relevance.py.
+AGENDA_SCOPE_MIN_TERM_HITS = _env_int(
+    "DEEPGRAPH_AGENDA_SCOPE_MIN_TERM_HITS", 1, "agenda.scope_min_term_hits"
+)
+
 # SciForge Experiment Validation
 EXPERIMENT_TIME_BUDGET = _env_int("SCIFORGE_TIME_BUDGET", 300, "experiment.time_budget_seconds")
 EXPERIMENT_MAX_ITERATIONS = _env_int("SCIFORGE_MAX_ITERATIONS", 100, "experiment.max_iterations")
diff --git a/orchestrator/discovery_scheduler.py b/orchestrator/discovery_scheduler.py
index 6979e6a..3159f58 100644
--- a/orchestrator/discovery_scheduler.py
+++ b/orchestrator/discovery_scheduler.py
@@ -110,18 +110,20 @@ def run_tier1_discovery(
     try:
         if agenda is not None:
             with agenda_scope(agenda.agenda_id, "tier1_discovery"):
-                insights = discover_paradigm_insights(
+                result = discover_paradigm_insights(
                     max_candidates=max_candidates,
                     tier1_top_overlaps=top_ov,
                     tier1_top_patterns=top_pat,
                     agenda=agenda,
                 )
         else:
-            insights = discover_paradigm_insights(
+            result = discover_paradigm_insights(
                 max_candidates=max_candidates,
                 tier1_top_overlaps=top_ov,
                 tier1_top_patterns=top_pat,
             )
+        insights = result["insights"]
+        dropped_out_of_scope = int(result.get("dropped_out_of_scope") or 0)
         stored = []
         for ins in insights:
             insight_id = store_deep_insight(ins)
@@ -142,8 +144,19 @@ def run_tier1_discovery(
                 "title": ins["title"],
                 "adversarial_score": ins.get("adversarial_score", 0),
             })
-        log_event("discovery", {"step": "tier1_done", "count": len(stored)})
-        print(f"[DISCOVERY] Tier 1 done: {len(stored)} paradigm insights stored", flush=True)
+        log_event(
+            "discovery",
+            {
+                "step": "tier1_done",
+                "count": len(stored),
+                "dropped_out_of_scope": dropped_out_of_scope,
+            },
+        )
+        print(
+            f"[DISCOVERY] Tier 1 done: {len(stored)} paradigm insights stored "
+            f"({dropped_out_of_scope} dropped as out of agenda scope)",
+            flush=True,
+        )
         return stored
     except Exception as e:
         if _llm_temporarily_unavailable(e):
@@ -199,7 +212,7 @@ def run_tier2_discovery(
     try:
         if agenda is not None:
             with agenda_scope(agenda.agenda_id, "tier2_discovery"):
-                insights = discover_paper_ideas(
+                result = discover_paper_ideas(
                     max_problems=max_problems,
                     max_papers=mpapers,
                     tier2_plateau_limit=plateaus,
@@ -207,12 +220,14 @@ def run_tier2_discovery(
                     agenda=agenda,
                 )
         else:
-            insights = discover_paper_ideas(
+            result = discover_paper_ideas(
                 max_problems=max_problems,
                 max_papers=mpapers,
                 tier2_plateau_limit=plateaus,
                 tier2_limitation_nodes=lim_nodes,
             )
+        insights = result["insights"]
+        dropped_out_of_scope = int(result.get("dropped_out_of_scope") or 0)
         stored = []
         for ins in insights:
             insight_id = store_deep_insight(ins)
@@ -238,8 +253,19 @@ def run_tier2_discovery(
                 "title": ins["title"],
                 "method_name": method.get("name", ""),
             })
-        log_event("discovery", {"step": "tier2_done", "count": len(stored)})
-        print(f"[DISCOVERY] Tier 2 done: {len(stored)} paper ideas stored", flush=True)
+        log_event(
+            "discovery",
+            {
+                "step": "tier2_done",
+                "count": len(stored),
+                "dropped_out_of_scope": dropped_out_of_scope,
+            },
+        )
+        print(
+            f"[DISCOVERY] Tier 2 done: {len(stored)} paper ideas stored "
+            f"({dropped_out_of_scope} dropped as out of agenda scope)",
+            flush=True,
+        )
         return stored
     except Exception as e:
         if _llm_temporarily_unavailable(e):
diff --git a/scripts/run_deepgraph_new_idea_once.py b/scripts/run_deepgraph_new_idea_once.py
index af6aa85..c5a298b 100644
--- a/scripts/run_deepgraph_new_idea_once.py
+++ b/scripts/run_deepgraph_new_idea_once.py
@@ -103,10 +103,11 @@ def main() -> int:
         print(_as_json({"event": "harvest_signals_start"}), flush=True)
         print(_as_json({"event": "harvest_signals_done", "stats": harvest_signals()}), flush=True)
 
-    ideas = discover_paper_ideas(
+    idea_result = discover_paper_ideas(
         max_problems=args.max_problems,
         max_papers=args.max_generated,
     )
+    ideas = idea_result["insights"]
     print(_as_json({"event": "ideas_generated", "count": len(ideas)}), flush=True)
 
     stored: list[int] = []
diff --git a/tests/test_agenda_prompt_scoping.py b/tests/test_agenda_prompt_scoping.py
new file mode 100644
index 0000000..729e6e0
--- /dev/null
+++ b/tests/test_agenda_prompt_scoping.py
@@ -0,0 +1,400 @@
+"""Agenda direction hard constraint: prompt injection + deterministic scope gate.
+
+Field report that motivated this: an agenda about outlier rejection /
+correspondence pruning / SfM produced five Tier 2 ideas about RAG, code
+generation and graph expressivity — the signal scan was scoped (PR #41) but
+the generation prompts never mentioned the agenda and nothing checked the
+output. These tests pin down both fixes:
+
+- with an agenda, every generation prompt carries the direction verbatim plus
+  the scope keywords; without one, prompts are built exactly as before;
+- generated insights that match none of the agenda's scope terms are dropped
+  before storage and counted in dropped_out_of_scope.
+"""
+
+from __future__ import annotations
+
+import json
+import os
+import unittest
+from unittest import mock
+
+os.environ["DEEPGRAPH_DATABASE_URL"] = ""  # force SQLite tmpdir; never touch a real DB from the environment
+
+CONSTRAINT_HEADER = "# RESEARCH DIRECTION CONSTRAINT"
+
+DIRECTION_TEXT = "特征对应图上的粗差检测（outlier rejection / correspondence pruning / SfM）"
+
+
+def _make_agenda():
+    from agents.agenda_loader import parse_agenda
+
+    return parse_agenda(
+        {
+            "version": "v1",
+            "name": "outlier-correspondence-sfm",
+            "description": DIRECTION_TEXT,
+            "focus": ["outlier rejection", "correspondence pruning", "sfm"],
+            "prefer": {"keywords": ["robust estimation"]},
+        }
+    )
+
+
+def _tier1_signals():
+    return {
+        "entity_overlaps": [
+            {
+                "node_a_id": "cv.sfm",
+                "node_b_id": "cv.matching",
+                "taxonomic_distance": 4,
+                "shared_entity_count": 3,
+                "overlap_score": 0.42,
+                "shared_entity_ids": json.dumps(
+                    [{"name": "ransac"}, {"name": "essential matrix"}]
+                ),
+                "shared_entity_types": json.dumps({"method": 2}),
+            }
+        ],
+        "pattern_matches": [],
+        "contradiction_clusters": [],
+        "taxonomy_map": [],
+    }
+
+
+def _tier2_signals():
+    return {
+        "contradiction_clusters": [],
+        "performance_plateaus": [],
+        "limitation_clusters": [],
+        "high_potential_insights": [
+            {
+                "title": "Matching pipelines disagree on inlier definitions",
+                "insight_type": "limitation",
+                "hypothesis": "Correspondence filtering thresholds are protocol artifacts",
+                "paradigm_score": 7,
+            }
+        ],
+        "mechanism_mismatches": [],
+        "protocol_artifacts": [],
+        "negative_space_gaps": [],
+        "hidden_variable_bridges": [],
+        "claim_method_gaps": [],
+    }
+
+
+class ConstraintBlockTests(unittest.TestCase):
+    def test_block_carries_direction_and_keywords(self):
+        from agents.agenda_relevance import agenda_constraint_block
+
+        block = agenda_constraint_block(_make_agenda())
+        self.assertIn(CONSTRAINT_HEADER, block)
+        self.assertIn(DIRECTION_TEXT, block)
+        self.assertIn("outlier rejection", block)
+        self.assertIn("correspondence pruning", block)
+        self.assertIn("robust estimation", block)
+
+    def test_match_terms_include_phrases_and_tokens(self):
+        from agents.agenda_relevance import agenda_match_terms
+
+        terms = agenda_match_terms(_make_agenda())
+        for expected in (
+            "outlier rejection",  # verbatim phrase
+            "outlier",            # phrase token
+            "pruning",
+            "sfm",
+            "robust estimation",  # prefer.keywords
+        ):
+            self.assertIn(expected, terms)
+
+
+class Tier1PromptInjectionTests(unittest.TestCase):
+    def test_structure_prompt_without_agenda_is_unchanged(self):
+        from agents.paradigm_agent import _build_structure_prompt
+
+        signals = _tier1_signals()
+        default_prompt = _build_structure_prompt(signals)
+        self.assertEqual(default_prompt, _build_structure_prompt(signals, agenda=None))
+        self.assertNotIn(CONSTRAINT_HEADER, default_prompt)
+
+    def test_structure_prompt_with_agenda_appends_constraint(self):
+        from agents.paradigm_agent import _build_structure_prompt
+
+        signals = _tier1_signals()
+        prompt = _build_structure_prompt(signals, agenda=_make_agenda())
+        self.assertIn(CONSTRAINT_HEADER, prompt)
+        self.assertIn(DIRECTION_TEXT, prompt)
+        self.assertIn("outlier rejection", prompt)
+        # Evidence section still present, before the constraint block.
+        self.assertLess(prompt.index("CROSS-FIELD EVIDENCE"), prompt.index(CONSTRAINT_HEADER))
+
+    def test_formalization_prompt_injection(self):
+        from agents.paradigm_agent import _build_formalization_prompt
+
+        candidate = {"title": "Candidate", "field_a": {}, "field_b": {}}
+        signals = _tier1_signals()
+        default_prompt = _build_formalization_prompt(candidate, signals)
+        self.assertEqual(
+            default_prompt, _build_formalization_prompt(candidate, signals, agenda=None)
+        )
+        self.assertNotIn(CONSTRAINT_HEADER, default_prompt)
+
+        prompt = _build_formalization_prompt(candidate, signals, agenda=_make_agenda())
+        self.assertIn(CONSTRAINT_HEADER, prompt)
+        self.assertIn(DIRECTION_TEXT, prompt)
+
+
+class Tier2PromptInjectionTests(unittest.TestCase):
+    PROBLEM = {
+        "title": "Inlier threshold drift",
+        "source_type": "limitation",
+        "source_evidence": "3 papers",
+        "formal_statement": "Minimize false matches",
+        "current_failure_mode": "Fixed thresholds",
+        "desideratum": "Adaptive filtering",
+        "impact_scope": "matching pipelines",
+        "related_node_ids": ["cv.sfm"],
+    }
+    METHOD = {
+        "name": "GraphGate",
+        "type": "training_procedure",
+        "one_line": "Filters correspondences",
+        "definition": "L = ...",
+        "key_properties": [],
+        "limitations": "none stated",
+    }
+
+    def test_problem_prompt_injection(self):
+        from agents.paper_idea_agent import _build_problem_prompt
+
+        signals = _tier2_signals()
+        default_prompt = _build_problem_prompt(signals)
+        self.assertEqual(default_prompt, _build_problem_prompt(signals, agenda=None))
+        self.assertNotIn(CONSTRAINT_HEADER, default_prompt)
+
+        prompt = _build_problem_prompt(signals, agenda=_make_agenda())
+        self.assertIn(CONSTRAINT_HEADER, prompt)
+        self.assertIn(DIRECTION_TEXT, prompt)
+        self.assertIn("outlier rejection", prompt)
+
+    def test_method_prompt_injection(self):
+        from agents.paper_idea_agent import _build_method_prompt
+
+        default_prompt = _build_method_prompt(self.PROBLEM)
+        self.assertEqual(default_prompt, _build_method_prompt(self.PROBLEM, agenda=None))
+        self.assertNotIn(CONSTRAINT_HEADER, default_prompt)
+
+        prompt = _build_method_prompt(self.PROBLEM, agenda=_make_agenda())
+        self.assertIn(CONSTRAINT_HEADER, prompt)
+        self.assertIn(DIRECTION_TEXT, prompt)
+
+    def test_experiment_prompt_injection(self):
+        from agents.paper_idea_agent import _build_experiment_prompt
+
+        default_prompt = _build_experiment_prompt(self.PROBLEM, self.METHOD)
+        self.assertEqual(
+            default_prompt, _build_experiment_prompt(self.PROBLEM, self.METHOD, agenda=None)
+        )
+        self.assertNotIn(CONSTRAINT_HEADER, default_prompt)
+
+        prompt = _build_experiment_prompt(self.PROBLEM, self.METHOD, agenda=_make_agenda())
+        self.assertIn(CONSTRAINT_HEADER, prompt)
+        self.assertIn(DIRECTION_TEXT, prompt)
+
+
+class ScopeGateTests(unittest.TestCase):
+    IN_SCOPE = {
+        "title": "Adaptive outlier rejection on correspondence graphs",
+        "problem_statement": "Minimize wrong matches kept by fixed RANSAC thresholds in SfM",
+        "proposed_method": json.dumps({"name": "Consistency-weighted pruning"}),
+    }
+    OFF_SCOPE = {
+        "title": "Retrieval grounding gap in code generation",
+        "problem_statement": "Align retriever and generator objectives to cut hallucinated APIs",
+        "proposed_method": json.dumps({"name": "Joint retriever-generator objective"}),
+    }
+
+    def test_in_scope_insight_passes(self):
+        from agents.agenda_relevance import insight_in_scope
+
+        self.assertTrue(insight_in_scope(self.IN_SCOPE, _make_agenda()))
+
+    def test_off_scope_insight_is_rejected(self):
+        from agents.agenda_relevance import insight_in_scope
+
+        self.assertFalse(insight_in_scope(self.OFF_SCOPE, _make_agenda()))
+
+    def test_no_agenda_passes_everything(self):
+        from agents.agenda_relevance import insight_in_scope
+
+        self.assertTrue(insight_in_scope(self.OFF_SCOPE, None))
+
+    def test_threshold_is_configurable(self):
+        from agents.agenda_relevance import insight_in_scope
+
+        # Title-only insight can only hit on "sfm"; a high threshold drops it.
+        sparse = {"title": "A note on sfm"}
+        self.assertTrue(insight_in_scope(sparse, _make_agenda(), min_hits=1))
+        self.assertFalse(insight_in_scope(sparse, _make_agenda(), min_hits=3))
+        # Zero disables the gate.
+        self.assertTrue(insight_in_scope(self.OFF_SCOPE, _make_agenda(), min_hits=0))
+
+    def test_non_ascii_only_agenda_disables_gate(self):
+        from agents.agenda_loader import parse_agenda
+        from agents.agenda_relevance import insight_in_scope
+
+        agenda = parse_agenda(
+            {
+                "version": "v1",
+                "name": "chinese-only",
+                "description": "粗差检测",
+                "focus": ["粗差检测"],
+            }
+        )
+        # No ASCII-matchable term: dropping everything would be worse than
+        # dropping nothing, so the gate stands down.
+        self.assertTrue(insight_in_scope(self.OFF_SCOPE, agenda))
+
+
+class Tier2DroppedCountTests(unittest.TestCase):
+    def _run(self):
+        import agents.paper_idea_agent as pia
+
+        in_scope_problem = {
+            "title": "Threshold-free outlier rejection for correspondence graphs",
+            "source_type": "limitation",
+            "source_evidence": "3 papers report fixed inlier thresholds",
+            "formal_statement": "Minimize surviving wrong matches in SfM correspondence pruning",
+            "current_failure_mode": "Fixed global thresholds ignore scene geometry",
+            "desideratum": "Per-edge adaptive filtering",
+            "impact_scope": "matching pipelines",
+            "mechanism_type": "protocol_artifact",
+            "related_node_ids": ["cv.sfm"],
+        }
+        off_scope_problem = {
+            "title": "Retrieval grounding gap in code synthesis",
+            "source_type": "plateau",
+            "source_evidence": "5 papers within 1%",
+            "formal_statement": "Minimize hallucinated API calls in retrieval-augmented code synthesis",
+            "current_failure_mode": "Retriever and generator trained separately",
+            "desideratum": "Joint objective",
+            "impact_scope": "code assistants",
+            "mechanism_type": "plateau",
+            "related_node_ids": ["nlp.codegen"],
+        }
+        method_payload = {
+            "method": {
+                "name": "EdgeGate",
+                "type": "training_procedure",
+                "one_line": "Per-edge gating",
+                "definition": "g(e) = sigma(w^T f(e))",
+                "key_properties": [],
+                "why_novel": "Differs from the three closest filters by gating per edge with learned geometry features",
+                "limitations": "needs labels",
+            }
+        }
+        exp_in = {"paper_title": "Adaptive correspondence pruning without global thresholds"}
+        exp_off = {"paper_title": "Jointly trained retrieval for grounded code synthesis"}
+
+        with (
+            mock.patch.object(pia, "get_tier2_signals", return_value=_tier2_signals()),
+            mock.patch.object(pia, "agenda_taxonomy_node_ids", return_value=[]),
+            mock.patch.object(
+                pia,
+                "call_llm_json",
+                side_effect=[
+                    ({"problems": [in_scope_problem, off_scope_problem]}, 10),
+                    (method_payload, 10),
+                    (exp_in, 10),
+                    (method_payload, 10),
+                    (exp_off, 10),
+                ],
+            ) as llm,
+            mock.patch.object(pia, "get_evosci_input_issue", return_value=None),
+            mock.patch.object(pia, "enrich_deep_insight", side_effect=lambda d: d),
+            mock.patch.object(pia, "build_evidence_packet", return_value={}),
+        ):
+            result = pia.discover_paper_ideas(agenda=_make_agenda())
+        return result, llm
+
+    def test_off_scope_idea_is_dropped_and_counted(self):
+        result, llm = self._run()
+        self.assertEqual(result["dropped_out_of_scope"], 1)
+        self.assertEqual(len(result["insights"]), 1)
+        self.assertIn("pruning", result["insights"][0]["title"].lower())
+        # The generation prompt itself carried the constraint block.
+        problem_prompt = llm.call_args_list[0].args[1]
+        self.assertIn(CONSTRAINT_HEADER, problem_prompt)
+        self.assertIn(DIRECTION_TEXT, problem_prompt)
+
+
+class Tier1DroppedCountTests(unittest.TestCase):
+    def _run(self):
+        import agents.paradigm_agent as pa
+
+        in_scope_candidate = {
+            "title": "Outlier rejection and correspondence pruning share a consistency structure",
+            "confidence": 9,
+            "field_a": {},
+            "field_b": {},
+            "unifying_structure": "graph consistency",
+            "shared_failure_mode": "threshold sensitivity",
+            "evidence_from_graph": "shared ransac usage",
+        }
+        off_scope_candidate = {
+            "title": "Dialogue state tracking unifies with knowledge base completion",
+            "confidence": 5,
+            "field_a": {},
+            "field_b": {},
+            "unifying_structure": "slot filling",
+            "shared_failure_mode": "schema drift",
+            "evidence_from_graph": "shared ontology entities",
+        }
+        formal_in = {
+            "title": "Consistency-constrained outlier rejection across SfM pipelines",
+            "formal_structure": "argmin over correspondence subgraphs",
+            "transformation": "map pruning rules to consistency potentials",
+        }
+        formal_off = {
+            "title": "Dialogue state tracking as schema alignment",
+            "formal_structure": "argmax over slot assignments",
+            "transformation": "map ontology edges to slot values",
+        }
+        adversarial = {
+            "overall_score": 8,
+            "verdict": "interesting",
+            "attacks": [],
+            "strongest_attack": "",
+        }
+
+        with (
+            mock.patch.object(pa, "get_tier1_signals", return_value=_tier1_signals()),
+            mock.patch.object(pa, "agenda_taxonomy_node_ids", return_value=[]),
+            mock.patch.object(
+                pa,
+                "call_llm_json",
+                side_effect=[
+                    ({"candidates": [in_scope_candidate, off_scope_candidate]}, 10),
+                    (formal_in, 10),
+                    (formal_off, 10),
+                ],
+            ) as llm,
+            mock.patch.object(pa, "_call_with_provider", return_value=(adversarial, 5)),
+            mock.patch.object(pa, "get_evosci_input_issue", return_value=None),
+            mock.patch.object(pa, "enrich_deep_insight", side_effect=lambda d: d),
+            mock.patch.object(pa, "build_evidence_packet", return_value={}),
+        ):
+            result = pa.discover_paradigm_insights(agenda=_make_agenda())
+        return result, llm
+
+    def test_off_scope_insight_is_dropped_and_counted(self):
+        result, llm = self._run()
+        self.assertEqual(result["dropped_out_of_scope"], 1)
+        self.assertEqual(len(result["insights"]), 1)
+        self.assertIn("outlier rejection", result["insights"][0]["title"].lower())
+        structure_prompt = llm.call_args_list[0].args[1]
+        self.assertIn(CONSTRAINT_HEADER, structure_prompt)
+        self.assertIn(DIRECTION_TEXT, structure_prompt)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/tests/test_vnext_discovery.py b/tests/test_vnext_discovery.py
index 47da0d1..ab404fc 100644
--- a/tests/test_vnext_discovery.py
+++ b/tests/test_vnext_discovery.py
@@ -133,7 +133,10 @@ def test_run_tier1_discovery_skips_zero_id_and_keeps_next_result(self):
 
         with (
             mock.patch.object(discovery_scheduler, "_init_schema_v2"),
-            mock.patch("agents.paradigm_agent.discover_paradigm_insights", return_value=insights),
+            mock.patch(
+                "agents.paradigm_agent.discover_paradigm_insights",
+                return_value={"insights": insights, "dropped_out_of_scope": 0},
+            ),
             mock.patch("agents.paradigm_agent.store_deep_insight", side_effect=[0, 42]),
             mock.patch.object(discovery_scheduler, "log_event"),
         ):