From fa132138098fc66cfa812998de7981cea981b3bb Mon Sep 17 00:00:00 2001 From: Protocol Zero <257158451+Protocol-zero-0@users.noreply.github.com> Date: Fri, 12 Jun 2026 23:39:15 +0000 Subject: [PATCH] =?UTF-8?q?Agenda=20=E6=96=B9=E5=90=91=E7=A1=AC=E7=BA=A6?= =?UTF-8?q?=E6=9D=9F=EF=BC=9A=E7=94=9F=E6=88=90=20prompt=20=E6=B3=A8?= =?UTF-8?q?=E5=85=A5=E6=96=B9=E5=90=91=E5=8E=9F=E8=AF=9D=20+=20=E5=87=BA?= =?UTF-8?q?=E5=9F=9F=E7=BB=93=E6=9E=9C=E7=A1=AE=E5=AE=9A=E6=80=A7=E6=8B=A6?= =?UTF-8?q?=E6=88=AA?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit #41 只圈了信号消费侧,Tier1/Tier2 的生成 prompt 完全不知道 agenda 存在, taxonomy 匹配不上还会回退全库扫描,实测一个粗差检测方向的 agenda 产出的 5 条 Tier2 idea 全部跑题(RAG/代码生成等)。 - 两个 agent 的全部生成环节(结构检测、形式化、问题提炼、方法设计、实验 设计)在带 agenda 时把方向原话 + focus/prefer 关键词作为硬约束段注入 prompt;不带 agenda 时 prompt 逐字节不变 - 新增 agents/agenda_relevance.py:生成后的规则关键词闸,命中任一范围词 即保留(阈值 AGENDA_SCOPE_MIN_TERM_HITS 可配,默认 1,0 关闭),出域 insight 不入库,计入 dropped_out_of_scope 并记日志 - discover_paradigm_insights / discover_paper_ideas 返回值改为 {insights, dropped_out_of_scope},调度器解包并写入 tier*_done 事件 --- agents/agenda_relevance.py | 160 ++++++++++ agents/paper_idea_agent.py | 75 +++-- agents/paradigm_agent.py | 67 +++-- config.py | 8 + orchestrator/discovery_scheduler.py | 42 ++- scripts/run_deepgraph_new_idea_once.py | 3 +- tests/test_agenda_prompt_scoping.py | 400 +++++++++++++++++++++++++ tests/test_vnext_discovery.py | 5 +- 8 files changed, 709 insertions(+), 51 deletions(-) create mode 100644 agents/agenda_relevance.py create mode 100644 tests/test_agenda_prompt_scoping.py diff --git a/agents/agenda_relevance.py b/agents/agenda_relevance.py new file mode 100644 index 0000000..e302627 --- /dev/null +++ b/agents/agenda_relevance.py @@ -0,0 +1,160 @@ +"""Agenda direction guardrails: prompt constraint block + deterministic scope gate. + +PR #41 scoped what the Tier 1 / Tier 2 agents *read* (signal queries circled to +the agenda's taxonomy subgraph), but nothing constrained what they *write*: the +generation prompts never mentioned the agenda, so off-topic candidates passed +through whenever the taxonomy match was loose or fell back to the global scan. + +This module adds the missing two pieces, both rule-based (no extra LLM calls): + +1. ``agenda_constraint_block(agenda)`` — a prompt section appended to every + generation prompt when an agenda is present, stating the user's direction + verbatim plus the scope keywords, and instructing the model to stay inside. +2. ``insight_in_scope(insight, agenda)`` — a post-generation keyword gate. + Generated insights whose text matches none of the agenda's scope terms are + dropped before storage and reported as ``dropped_out_of_scope``. + +The gate is intentionally lenient: by default one term hit is enough to keep +an insight (configurable via AGENDA_SCOPE_MIN_TERM_HITS). Its job is to catch +clearly unrelated output, not to rank borderline cases — prompt steering does +the fine-grained work, the gate is the deterministic backstop. +""" + +from __future__ import annotations + +import re +from typing import Any, Iterable + +from config import AGENDA_SCOPE_MIN_TERM_HITS + +# Tokens that appear in almost any ML research direction. Useless as scope +# evidence when auto-extracted from free text, so they are skipped during +# tokenization. Explicit keyword phrases (focus / prefer.keywords) are always +# kept verbatim regardless of this list. +_GENERIC_TOKENS = { + "and", "are", "based", "between", "data", "deep", "for", "from", "into", + "learning", "machine", "method", "methods", "model", "models", "new", + "novel", "over", "paper", "papers", "research", "task", "tasks", "that", + "the", "this", "toward", "towards", "under", "use", "using", "via", + "with", +} + +_TOKEN_RE = re.compile(r"[a-z][a-z0-9\-]{2,}") + +# Fields whose text represents what a generated insight is about. Covers both +# tiers: Tier 2 ideas carry title/problem_statement/proposed_method, Tier 1 +# paradigm insights carry title/formal_structure/transformation. +_SCOPE_TEXT_FIELDS = ( + "title", + "problem_statement", + "proposed_method", + "formal_structure", + "transformation", +) + + +def _tokens(text: Any) -> list[str]: + return [ + tok + for tok in _TOKEN_RE.findall(str(text or "").lower()) + if tok not in _GENERIC_TOKENS + ] + + +def agenda_match_terms(agenda) -> list[str]: + """Lowercased scope terms for the relevance gate. + + Combines, deduplicated and in order: + - focus + prefer.keywords phrases, verbatim (the user named these); + - the individual tokens of those phrases (so "outlier rejection" also + matches text that only says "outlier"); + - tokens extracted from the direction description (catches terms the + user wrote in the free-text direction but not in the keyword list). + """ + from agents.agenda_selector import agenda_scope_keywords + + seen: set[str] = set() + terms: list[str] = [] + + def _add(term: str) -> None: + term = term.strip().lower() + if term and term not in seen: + seen.add(term) + terms.append(term) + + for phrase in agenda_scope_keywords(agenda): + _add(phrase) + for tok in _tokens(phrase): + _add(tok) + for tok in _tokens(getattr(agenda, "description", "")): + _add(tok) + return terms + + +def agenda_constraint_block(agenda) -> str: + """Prompt section that pins generation to the agenda's direction. + + Appended to the user prompt of every generation call when an agenda is + present; without an agenda the prompts are untouched. + """ + from agents.agenda_selector import agenda_scope_keywords + + direction = str(getattr(agenda, "description", "") or "").strip() + if not direction: + direction = str(getattr(agenda, "name", "") or "").strip() + lines = [ + "", + "# RESEARCH DIRECTION CONSTRAINT (hard requirement)", + "", + "All output must stay inside this user-defined research direction:", + "", + f"Direction: {direction}", + ] + keywords = agenda_scope_keywords(agenda) + if keywords: + lines.append(f"Scope keywords: {', '.join(keywords)}") + lines.extend( + [ + "", + "Rules:", + "- Only propose problems, insights, and methods that fall inside this direction.", + "- Ignore signals and evidence unrelated to the direction, even if they look promising.", + "- If little of the evidence fits the direction, return fewer items rather than drifting off-topic.", + ] + ) + return "\n".join(lines) + + +def insight_scope_text(insight: dict) -> str: + """Lowercased text of the fields that describe what an insight is about.""" + parts = [] + for field in _SCOPE_TEXT_FIELDS: + value = insight.get(field) + if value: + parts.append(str(value)) + return " ".join(parts).lower() + + +def count_term_hits(text: str, terms: Iterable[str]) -> int: + return sum(1 for term in terms if term in text) + + +def insight_in_scope(insight: dict, agenda, *, min_hits: int | None = None) -> bool: + """Deterministic check that a generated insight matches the agenda's scope. + + Lenient by design (default: one term hit keeps the insight). Disabled — + everything passes — when there is no agenda, when the threshold is set to + zero, or when the agenda yields no ASCII-matchable term (e.g. a direction + written entirely in Chinese cannot be matched against English insight + text, and dropping everything would be worse than dropping nothing). + """ + if agenda is None: + return True + if min_hits is None: + min_hits = AGENDA_SCOPE_MIN_TERM_HITS + if min_hits <= 0: + return True + terms = agenda_match_terms(agenda) + if not any(re.search(r"[a-z0-9]", term) for term in terms): + return True + return count_term_hits(insight_scope_text(insight), terms) >= min_hits diff --git a/agents/paper_idea_agent.py b/agents/paper_idea_agent.py index aa65724..b4b11b4 100644 --- a/agents/paper_idea_agent.py +++ b/agents/paper_idea_agent.py @@ -10,6 +10,7 @@ """ import json from agents.agenda_budget import AgendaBudgetExceededError +from agents.agenda_relevance import agenda_constraint_block, insight_in_scope from agents.discovery_metadata import build_evidence_packet, enrich_deep_insight from agents.insight_validation import get_evosci_input_issue from agents.llm_client import call_llm_json, is_llm_auth_error, is_llm_provider_unavailable_error @@ -243,8 +244,12 @@ }""" -def _build_problem_prompt(signals: dict) -> str: - """Build evidence prompt for Call 1 (Problem Sharpening).""" +def _build_problem_prompt(signals: dict, agenda=None) -> str: + """Build evidence prompt for Call 1 (Problem Sharpening). + + With an agenda, the user's research direction is appended as a hard + constraint; without one the prompt is built exactly as before. + """ sections = ["# EVIDENCE FROM 10,000+ ML PAPERS\n"] # Contradiction clusters @@ -337,12 +342,15 @@ def _build_problem_prompt(signals: dict) -> str: for row in rows[:6]: sections.append(f"- {json.dumps(row, ensure_ascii=True, default=str)[:260]}") + if agenda is not None: + sections.append(agenda_constraint_block(agenda)) + return "\n".join(sections) -def _build_method_prompt(problem: dict) -> str: +def _build_method_prompt(problem: dict, agenda=None) -> str: """Build prompt for Call 2 (Method Invention).""" - return f"""# RESEARCH PROBLEM + prompt = f"""# RESEARCH PROBLEM ## Title: {problem['title']} @@ -365,11 +373,14 @@ def _build_method_prompt(problem: dict) -> str: Design a NEW method that addresses this specific failure mode. The method must be technically novel — not "apply [existing technique] to [this domain]".""" + if agenda is not None: + prompt += "\n" + agenda_constraint_block(agenda) + return prompt -def _build_experiment_prompt(problem: dict, method: dict) -> str: +def _build_experiment_prompt(problem: dict, method: dict, agenda=None) -> str: """Build prompt for Call 3 (Experimental Design).""" - return f"""# PROPOSED RESEARCH + prompt = f"""# PROPOSED RESEARCH ## Problem Title: {problem['title']} @@ -387,6 +398,9 @@ def _build_experiment_prompt(problem: dict, method: dict) -> str: Design a complete experimental plan for validating this method. Be specific: exact model names, dataset names, metric names, compute estimates.""" + if agenda is not None: + prompt += "\n" + agenda_constraint_block(agenda) + return prompt def _llm_temporarily_unavailable(exc: Exception) -> bool: @@ -400,16 +414,19 @@ def discover_paper_ideas( tier2_plateau_limit: int = 20, tier2_limitation_nodes: int = 15, agenda=None, -) -> list[dict]: +) -> dict: """Run the 3-stage paper idea discovery pipeline. - Returns list of deep_insight dicts ready for storage. + Returns {"insights": [...], "dropped_out_of_scope": n} where insights are + deep_insight dicts ready for storage. If max_papers is None, every sharpened problem (up to max_problems) is expanded. With an agenda (contracts.agenda.ResearchAgenda), the signal scan is - circled to the matching taxonomy subgraph and produced ideas are tagged - with agenda_id. Budget exhaustion stops the loop cleanly, returning the - ideas accepted so far. + circled to the matching taxonomy subgraph, every generation prompt carries + the agenda's direction as a hard constraint, ideas whose text matches none + of the agenda's scope terms are dropped (counted in dropped_out_of_scope), + and produced ideas are tagged with agenda_id. Budget exhaustion stops the + loop cleanly, returning the ideas accepted so far. """ if max_papers is None: max_papers = max_problems @@ -417,6 +434,11 @@ def discover_paper_ideas( print(f"[PAPER_IDEA] Starting Tier 2 discovery...", flush=True) total_tokens = 0 total_calls = 0 + deep_insights: list[dict] = [] + dropped_out_of_scope = 0 + + def _result() -> dict: + return {"insights": deep_insights, "dropped_out_of_scope": dropped_out_of_scope} # Stage 0: Gather signals (scoped to the agenda's subgraph when known) scope_node_ids = None @@ -451,29 +473,29 @@ def discover_paper_ideas( ) if not has_signals: print("[PAPER_IDEA] No signals available. Run signal_harvester first.", flush=True) - return [] + return _result() # Stage 1: Problem Sharpening print("[PAPER_IDEA] Call 1/3: Problem Sharpening...", flush=True) - problem_prompt = _build_problem_prompt(signals) + problem_prompt = _build_problem_prompt(signals, agenda=agenda) try: result1, tokens1 = call_llm_json(PROBLEM_SHARPENING_SYSTEM, problem_prompt) total_tokens += tokens1 total_calls += 1 except AgendaBudgetExceededError as e: print(f"[PAPER_IDEA] Stopped before problem sharpening: {e}", flush=True) - return [] + return _result() except Exception as e: if _llm_temporarily_unavailable(e): print(f"[PAPER_IDEA] Problem sharpening skipped: LLM unavailable ({e})", flush=True) - return [] + return _result() print(f"[PAPER_IDEA] Problem sharpening failed: {e}", flush=True) - return [] + return _result() problems = result1.get("problems", []) if not problems: print("[PAPER_IDEA] No problems extracted", flush=True) - return [] + return _result() problem_budget = min(len(problems), max_problems + max(2, max_papers // 2)) problems = problems[:problem_budget] @@ -483,7 +505,6 @@ def discover_paper_ideas( ) # Stage 2 + 3: Method Invention + Experiment Design for top problems - deep_insights = [] for i, problem in enumerate(problems): if len(deep_insights) >= max_papers: break @@ -493,7 +514,7 @@ def discover_paper_ideas( # Stage 2: Method Invention print(f"[PAPER_IDEA] Call 2/3: Inventing method for '{title[:50]}'...", flush=True) - method_prompt = _build_method_prompt(problem) + method_prompt = _build_method_prompt(problem, agenda=agenda) try: result2, tokens2 = call_llm_json(METHOD_INVENTION_SYSTEM, method_prompt) total_tokens += tokens2 @@ -521,7 +542,7 @@ def discover_paper_ideas( # Stage 3: Experimental Design print(f"[PAPER_IDEA] Call 3/3: Designing experiments for '{method['name']}'...", flush=True) - exp_prompt = _build_experiment_prompt(problem, method) + exp_prompt = _build_experiment_prompt(problem, method, agenda=agenda) try: result3, tokens3 = call_llm_json(EXPERIMENT_DESIGN_SYSTEM, exp_prompt) total_tokens += tokens3 @@ -604,6 +625,15 @@ def discover_paper_ideas( "agenda_id": agenda.agenda_id if agenda is not None else None, } + if agenda is not None and not insight_in_scope(deep_insight, agenda): + dropped_out_of_scope += 1 + print( + f"[PAPER_IDEA] Dropped out-of-scope idea for agenda " + f"'{agenda.name}': {deep_insight['title'][:80]}", + flush=True, + ) + continue + input_issue = get_evosci_input_issue(deep_insight, mode="verification") if input_issue: missing = ", ".join(input_issue.get("missing_fields") or []) @@ -616,6 +646,7 @@ def discover_paper_ideas( deep_insights.append(enrich_deep_insight(deep_insight)) print(f"[PAPER_IDEA] Accepted: {method['name']} — {title[:60]}", flush=True) - print(f"[PAPER_IDEA] Done: {len(deep_insights)} paper ideas from {len(problems)} problems. " + print(f"[PAPER_IDEA] Done: {len(deep_insights)} paper ideas from {len(problems)} problems " + f"({dropped_out_of_scope} dropped as out of agenda scope). " f"Tokens: {total_tokens}, LLM calls: {total_calls}", flush=True) - return deep_insights + return _result() diff --git a/agents/paradigm_agent.py b/agents/paradigm_agent.py index 1f785c1..2c722a2 100644 --- a/agents/paradigm_agent.py +++ b/agents/paradigm_agent.py @@ -21,6 +21,7 @@ is_llm_provider_unavailable_error, ) from agents.agenda_budget import AgendaBudgetExceededError +from agents.agenda_relevance import agenda_constraint_block, insight_in_scope from contracts import DeepInsightSpec, normalize_deep_insight_storage from agents.signal_harvester import agenda_taxonomy_node_ids, get_tier1_signals from config import LLM_MODEL, PROMPT_VERSION @@ -201,8 +202,12 @@ def _guess_mechanism_type(candidate: dict, formalized: dict) -> str: return "structural_equivalence" -def _build_structure_prompt(signals: dict) -> str: - """Build the evidence prompt for Call 1 (Structure Detection).""" +def _build_structure_prompt(signals: dict, agenda=None) -> str: + """Build the evidence prompt for Call 1 (Structure Detection). + + With an agenda, the user's research direction is appended as a hard + constraint; without one the prompt is built exactly as before. + """ sections = [] sections.append("# CROSS-FIELD EVIDENCE FROM 10,000+ ML PAPERS\n") @@ -267,10 +272,13 @@ def _build_structure_prompt(signals: dict) -> str: # default=str keeps prompt building backend-agnostic. sections.append(f"- {json.dumps(row, ensure_ascii=True, default=str)[:220]}") + if agenda is not None: + sections.append(agenda_constraint_block(agenda)) + return "\n".join(sections) -def _build_formalization_prompt(candidate: dict, signals: dict) -> str: +def _build_formalization_prompt(candidate: dict, signals: dict, agenda=None) -> str: """Build the evidence prompt for Call 2 (Formalization).""" sections = [f"# CANDIDATE PARADIGM INSIGHT\n"] sections.append(f"## Title: {candidate['title']}\n") @@ -320,6 +328,9 @@ def _build_formalization_prompt(candidate: dict, signals: dict) -> str: for r in results: sections.append(f" {r['method_name']} on {r['dataset_name']} [{r['metric_name']}] = {r['metric_value']}") + if agenda is not None: + sections.append(agenda_constraint_block(agenda)) + return "\n".join(sections) @@ -382,20 +393,29 @@ def discover_paradigm_insights( tier1_top_overlaps: int = 20, tier1_top_patterns: int = 15, agenda=None, -) -> list[dict]: +) -> dict: """Run the 3-stage paradigm discovery pipeline. - Returns list of deep_insight dicts ready for storage. + Returns {"insights": [...], "dropped_out_of_scope": n} where insights are + deep_insight dicts ready for storage. With an agenda (contracts.agenda.ResearchAgenda), the signal scan is - circled to the taxonomy subgraph matching the agenda's scope keywords and - every produced insight is tagged with agenda_id. Budget exhaustion - (AgendaBudgetExceededError from the metered LLM client) stops the loop - cleanly, returning the insights accepted so far. + circled to the taxonomy subgraph matching the agenda's scope keywords, + every generation prompt carries the agenda's direction as a hard + constraint, insights whose text matches none of the agenda's scope terms + are dropped (counted in dropped_out_of_scope), and every produced insight + is tagged with agenda_id. Budget exhaustion (AgendaBudgetExceededError + from the metered LLM client) stops the loop cleanly, returning the + insights accepted so far. """ print(f"[PARADIGM] Starting Tier 1 discovery (max {max_candidates} candidates)...", flush=True) total_tokens = 0 total_calls = 0 + deep_insights: list[dict] = [] + dropped_out_of_scope = 0 + + def _result() -> dict: + return {"insights": deep_insights, "dropped_out_of_scope": dropped_out_of_scope} # Stage 0: Gather signals (scoped to the agenda's subgraph when known) scope_node_ids = None @@ -416,29 +436,29 @@ def discover_paradigm_insights( ) if not signals["entity_overlaps"] and not signals["pattern_matches"]: print("[PARADIGM] No signals available. Run signal_harvester first.", flush=True) - return [] + return _result() # Stage 1: Structure Detection print("[PARADIGM] Call 1/3: Structure Detection...", flush=True) - structure_prompt = _build_structure_prompt(signals) + structure_prompt = _build_structure_prompt(signals, agenda=agenda) try: result1, tokens1 = call_llm_json(STRUCTURE_DETECTION_SYSTEM, structure_prompt) total_tokens += tokens1 total_calls += 1 except AgendaBudgetExceededError as e: print(f"[PARADIGM] Stopped before structure detection: {e}", flush=True) - return [] + return _result() except Exception as e: if _llm_temporarily_unavailable(e): print(f"[PARADIGM] Structure detection skipped: LLM unavailable ({e})", flush=True) - return [] + return _result() print(f"[PARADIGM] Structure detection failed: {e}", flush=True) - return [] + return _result() candidates = result1.get("candidates", []) if not candidates: print("[PARADIGM] No candidates from structure detection", flush=True) - return [] + return _result() candidates.sort(key=lambda c: c.get("confidence", 0), reverse=True) candidate_budget = min(len(candidates), max_candidates + max(2, max_candidates // 2)) @@ -449,7 +469,6 @@ def discover_paradigm_insights( ) # Stage 2 + 3: Formalize and Challenge each candidate - deep_insights = [] for i, candidate in enumerate(candidates): if len(deep_insights) >= max_candidates: break @@ -458,7 +477,7 @@ def discover_paradigm_insights( # Stage 2: Formalization print(f"[PARADIGM] Call 2/3: Formalizing '{title[:50]}'...", flush=True) - formal_prompt = _build_formalization_prompt(candidate, signals) + formal_prompt = _build_formalization_prompt(candidate, signals, agenda=agenda) try: result2, tokens2 = call_llm_json(FORMALIZATION_SYSTEM, formal_prompt) total_tokens += tokens2 @@ -566,6 +585,15 @@ def discover_paradigm_insights( if result2.get("minimal_experiment"): deep_insight["experimental_plan"] = json.dumps(result2["minimal_experiment"]) + if agenda is not None and not insight_in_scope(deep_insight, agenda): + dropped_out_of_scope += 1 + print( + f"[PARADIGM] Dropped out-of-scope insight for agenda " + f"'{agenda.name}': {deep_insight['title'][:80]}", + flush=True, + ) + continue + input_issue = get_evosci_input_issue(deep_insight, mode="verification") if input_issue: missing = ", ".join(input_issue.get("missing_fields") or []) @@ -578,9 +606,10 @@ def discover_paradigm_insights( deep_insights.append(enrich_deep_insight(deep_insight)) print(f"[PARADIGM] Accepted: {title[:80]} (score={score})", flush=True) - print(f"[PARADIGM] Done: {len(deep_insights)} insights from {len(candidates)} candidates. " + print(f"[PARADIGM] Done: {len(deep_insights)} insights from {len(candidates)} candidates " + f"({dropped_out_of_scope} dropped as out of agenda scope). " f"Tokens: {total_tokens}, LLM calls: {total_calls}", flush=True) - return deep_insights + return _result() def _jsonify(v): diff --git a/config.py b/config.py index a34ce62..dd077c2 100644 --- a/config.py +++ b/config.py @@ -253,6 +253,14 @@ def _split_csv(value: str | list | tuple | None) -> list[str]: "DEEPGRAPH_AGENDA_TOKEN_BUDGET_DEFAULT", 0, "agenda.token_budget_default" ) +# Agenda relevance gate: minimum number of agenda scope terms (focus + +# prefer.keywords + direction-text tokens) a generated insight must mention +# to be stored under that agenda. Deliberately lenient: 1 keeps an insight on +# any single term hit; 0 disables the gate. See agents/agenda_relevance.py. +AGENDA_SCOPE_MIN_TERM_HITS = _env_int( + "DEEPGRAPH_AGENDA_SCOPE_MIN_TERM_HITS", 1, "agenda.scope_min_term_hits" +) + # SciForge Experiment Validation EXPERIMENT_TIME_BUDGET = _env_int("SCIFORGE_TIME_BUDGET", 300, "experiment.time_budget_seconds") EXPERIMENT_MAX_ITERATIONS = _env_int("SCIFORGE_MAX_ITERATIONS", 100, "experiment.max_iterations") diff --git a/orchestrator/discovery_scheduler.py b/orchestrator/discovery_scheduler.py index 6979e6a..3159f58 100644 --- a/orchestrator/discovery_scheduler.py +++ b/orchestrator/discovery_scheduler.py @@ -110,18 +110,20 @@ def run_tier1_discovery( try: if agenda is not None: with agenda_scope(agenda.agenda_id, "tier1_discovery"): - insights = discover_paradigm_insights( + result = discover_paradigm_insights( max_candidates=max_candidates, tier1_top_overlaps=top_ov, tier1_top_patterns=top_pat, agenda=agenda, ) else: - insights = discover_paradigm_insights( + result = discover_paradigm_insights( max_candidates=max_candidates, tier1_top_overlaps=top_ov, tier1_top_patterns=top_pat, ) + insights = result["insights"] + dropped_out_of_scope = int(result.get("dropped_out_of_scope") or 0) stored = [] for ins in insights: insight_id = store_deep_insight(ins) @@ -142,8 +144,19 @@ def run_tier1_discovery( "title": ins["title"], "adversarial_score": ins.get("adversarial_score", 0), }) - log_event("discovery", {"step": "tier1_done", "count": len(stored)}) - print(f"[DISCOVERY] Tier 1 done: {len(stored)} paradigm insights stored", flush=True) + log_event( + "discovery", + { + "step": "tier1_done", + "count": len(stored), + "dropped_out_of_scope": dropped_out_of_scope, + }, + ) + print( + f"[DISCOVERY] Tier 1 done: {len(stored)} paradigm insights stored " + f"({dropped_out_of_scope} dropped as out of agenda scope)", + flush=True, + ) return stored except Exception as e: if _llm_temporarily_unavailable(e): @@ -199,7 +212,7 @@ def run_tier2_discovery( try: if agenda is not None: with agenda_scope(agenda.agenda_id, "tier2_discovery"): - insights = discover_paper_ideas( + result = discover_paper_ideas( max_problems=max_problems, max_papers=mpapers, tier2_plateau_limit=plateaus, @@ -207,12 +220,14 @@ def run_tier2_discovery( agenda=agenda, ) else: - insights = discover_paper_ideas( + result = discover_paper_ideas( max_problems=max_problems, max_papers=mpapers, tier2_plateau_limit=plateaus, tier2_limitation_nodes=lim_nodes, ) + insights = result["insights"] + dropped_out_of_scope = int(result.get("dropped_out_of_scope") or 0) stored = [] for ins in insights: insight_id = store_deep_insight(ins) @@ -238,8 +253,19 @@ def run_tier2_discovery( "title": ins["title"], "method_name": method.get("name", ""), }) - log_event("discovery", {"step": "tier2_done", "count": len(stored)}) - print(f"[DISCOVERY] Tier 2 done: {len(stored)} paper ideas stored", flush=True) + log_event( + "discovery", + { + "step": "tier2_done", + "count": len(stored), + "dropped_out_of_scope": dropped_out_of_scope, + }, + ) + print( + f"[DISCOVERY] Tier 2 done: {len(stored)} paper ideas stored " + f"({dropped_out_of_scope} dropped as out of agenda scope)", + flush=True, + ) return stored except Exception as e: if _llm_temporarily_unavailable(e): diff --git a/scripts/run_deepgraph_new_idea_once.py b/scripts/run_deepgraph_new_idea_once.py index af6aa85..c5a298b 100644 --- a/scripts/run_deepgraph_new_idea_once.py +++ b/scripts/run_deepgraph_new_idea_once.py @@ -103,10 +103,11 @@ def main() -> int: print(_as_json({"event": "harvest_signals_start"}), flush=True) print(_as_json({"event": "harvest_signals_done", "stats": harvest_signals()}), flush=True) - ideas = discover_paper_ideas( + idea_result = discover_paper_ideas( max_problems=args.max_problems, max_papers=args.max_generated, ) + ideas = idea_result["insights"] print(_as_json({"event": "ideas_generated", "count": len(ideas)}), flush=True) stored: list[int] = [] diff --git a/tests/test_agenda_prompt_scoping.py b/tests/test_agenda_prompt_scoping.py new file mode 100644 index 0000000..729e6e0 --- /dev/null +++ b/tests/test_agenda_prompt_scoping.py @@ -0,0 +1,400 @@ +"""Agenda direction hard constraint: prompt injection + deterministic scope gate. + +Field report that motivated this: an agenda about outlier rejection / +correspondence pruning / SfM produced five Tier 2 ideas about RAG, code +generation and graph expressivity — the signal scan was scoped (PR #41) but +the generation prompts never mentioned the agenda and nothing checked the +output. These tests pin down both fixes: + +- with an agenda, every generation prompt carries the direction verbatim plus + the scope keywords; without one, prompts are built exactly as before; +- generated insights that match none of the agenda's scope terms are dropped + before storage and counted in dropped_out_of_scope. +""" + +from __future__ import annotations + +import json +import os +import unittest +from unittest import mock + +os.environ["DEEPGRAPH_DATABASE_URL"] = "" # force SQLite tmpdir; never touch a real DB from the environment + +CONSTRAINT_HEADER = "# RESEARCH DIRECTION CONSTRAINT" + +DIRECTION_TEXT = "特征对应图上的粗差检测(outlier rejection / correspondence pruning / SfM)" + + +def _make_agenda(): + from agents.agenda_loader import parse_agenda + + return parse_agenda( + { + "version": "v1", + "name": "outlier-correspondence-sfm", + "description": DIRECTION_TEXT, + "focus": ["outlier rejection", "correspondence pruning", "sfm"], + "prefer": {"keywords": ["robust estimation"]}, + } + ) + + +def _tier1_signals(): + return { + "entity_overlaps": [ + { + "node_a_id": "cv.sfm", + "node_b_id": "cv.matching", + "taxonomic_distance": 4, + "shared_entity_count": 3, + "overlap_score": 0.42, + "shared_entity_ids": json.dumps( + [{"name": "ransac"}, {"name": "essential matrix"}] + ), + "shared_entity_types": json.dumps({"method": 2}), + } + ], + "pattern_matches": [], + "contradiction_clusters": [], + "taxonomy_map": [], + } + + +def _tier2_signals(): + return { + "contradiction_clusters": [], + "performance_plateaus": [], + "limitation_clusters": [], + "high_potential_insights": [ + { + "title": "Matching pipelines disagree on inlier definitions", + "insight_type": "limitation", + "hypothesis": "Correspondence filtering thresholds are protocol artifacts", + "paradigm_score": 7, + } + ], + "mechanism_mismatches": [], + "protocol_artifacts": [], + "negative_space_gaps": [], + "hidden_variable_bridges": [], + "claim_method_gaps": [], + } + + +class ConstraintBlockTests(unittest.TestCase): + def test_block_carries_direction_and_keywords(self): + from agents.agenda_relevance import agenda_constraint_block + + block = agenda_constraint_block(_make_agenda()) + self.assertIn(CONSTRAINT_HEADER, block) + self.assertIn(DIRECTION_TEXT, block) + self.assertIn("outlier rejection", block) + self.assertIn("correspondence pruning", block) + self.assertIn("robust estimation", block) + + def test_match_terms_include_phrases_and_tokens(self): + from agents.agenda_relevance import agenda_match_terms + + terms = agenda_match_terms(_make_agenda()) + for expected in ( + "outlier rejection", # verbatim phrase + "outlier", # phrase token + "pruning", + "sfm", + "robust estimation", # prefer.keywords + ): + self.assertIn(expected, terms) + + +class Tier1PromptInjectionTests(unittest.TestCase): + def test_structure_prompt_without_agenda_is_unchanged(self): + from agents.paradigm_agent import _build_structure_prompt + + signals = _tier1_signals() + default_prompt = _build_structure_prompt(signals) + self.assertEqual(default_prompt, _build_structure_prompt(signals, agenda=None)) + self.assertNotIn(CONSTRAINT_HEADER, default_prompt) + + def test_structure_prompt_with_agenda_appends_constraint(self): + from agents.paradigm_agent import _build_structure_prompt + + signals = _tier1_signals() + prompt = _build_structure_prompt(signals, agenda=_make_agenda()) + self.assertIn(CONSTRAINT_HEADER, prompt) + self.assertIn(DIRECTION_TEXT, prompt) + self.assertIn("outlier rejection", prompt) + # Evidence section still present, before the constraint block. + self.assertLess(prompt.index("CROSS-FIELD EVIDENCE"), prompt.index(CONSTRAINT_HEADER)) + + def test_formalization_prompt_injection(self): + from agents.paradigm_agent import _build_formalization_prompt + + candidate = {"title": "Candidate", "field_a": {}, "field_b": {}} + signals = _tier1_signals() + default_prompt = _build_formalization_prompt(candidate, signals) + self.assertEqual( + default_prompt, _build_formalization_prompt(candidate, signals, agenda=None) + ) + self.assertNotIn(CONSTRAINT_HEADER, default_prompt) + + prompt = _build_formalization_prompt(candidate, signals, agenda=_make_agenda()) + self.assertIn(CONSTRAINT_HEADER, prompt) + self.assertIn(DIRECTION_TEXT, prompt) + + +class Tier2PromptInjectionTests(unittest.TestCase): + PROBLEM = { + "title": "Inlier threshold drift", + "source_type": "limitation", + "source_evidence": "3 papers", + "formal_statement": "Minimize false matches", + "current_failure_mode": "Fixed thresholds", + "desideratum": "Adaptive filtering", + "impact_scope": "matching pipelines", + "related_node_ids": ["cv.sfm"], + } + METHOD = { + "name": "GraphGate", + "type": "training_procedure", + "one_line": "Filters correspondences", + "definition": "L = ...", + "key_properties": [], + "limitations": "none stated", + } + + def test_problem_prompt_injection(self): + from agents.paper_idea_agent import _build_problem_prompt + + signals = _tier2_signals() + default_prompt = _build_problem_prompt(signals) + self.assertEqual(default_prompt, _build_problem_prompt(signals, agenda=None)) + self.assertNotIn(CONSTRAINT_HEADER, default_prompt) + + prompt = _build_problem_prompt(signals, agenda=_make_agenda()) + self.assertIn(CONSTRAINT_HEADER, prompt) + self.assertIn(DIRECTION_TEXT, prompt) + self.assertIn("outlier rejection", prompt) + + def test_method_prompt_injection(self): + from agents.paper_idea_agent import _build_method_prompt + + default_prompt = _build_method_prompt(self.PROBLEM) + self.assertEqual(default_prompt, _build_method_prompt(self.PROBLEM, agenda=None)) + self.assertNotIn(CONSTRAINT_HEADER, default_prompt) + + prompt = _build_method_prompt(self.PROBLEM, agenda=_make_agenda()) + self.assertIn(CONSTRAINT_HEADER, prompt) + self.assertIn(DIRECTION_TEXT, prompt) + + def test_experiment_prompt_injection(self): + from agents.paper_idea_agent import _build_experiment_prompt + + default_prompt = _build_experiment_prompt(self.PROBLEM, self.METHOD) + self.assertEqual( + default_prompt, _build_experiment_prompt(self.PROBLEM, self.METHOD, agenda=None) + ) + self.assertNotIn(CONSTRAINT_HEADER, default_prompt) + + prompt = _build_experiment_prompt(self.PROBLEM, self.METHOD, agenda=_make_agenda()) + self.assertIn(CONSTRAINT_HEADER, prompt) + self.assertIn(DIRECTION_TEXT, prompt) + + +class ScopeGateTests(unittest.TestCase): + IN_SCOPE = { + "title": "Adaptive outlier rejection on correspondence graphs", + "problem_statement": "Minimize wrong matches kept by fixed RANSAC thresholds in SfM", + "proposed_method": json.dumps({"name": "Consistency-weighted pruning"}), + } + OFF_SCOPE = { + "title": "Retrieval grounding gap in code generation", + "problem_statement": "Align retriever and generator objectives to cut hallucinated APIs", + "proposed_method": json.dumps({"name": "Joint retriever-generator objective"}), + } + + def test_in_scope_insight_passes(self): + from agents.agenda_relevance import insight_in_scope + + self.assertTrue(insight_in_scope(self.IN_SCOPE, _make_agenda())) + + def test_off_scope_insight_is_rejected(self): + from agents.agenda_relevance import insight_in_scope + + self.assertFalse(insight_in_scope(self.OFF_SCOPE, _make_agenda())) + + def test_no_agenda_passes_everything(self): + from agents.agenda_relevance import insight_in_scope + + self.assertTrue(insight_in_scope(self.OFF_SCOPE, None)) + + def test_threshold_is_configurable(self): + from agents.agenda_relevance import insight_in_scope + + # Title-only insight has a couple of hits; a high threshold drops it. + sparse = {"title": "A note on sfm"} + self.assertTrue(insight_in_scope(sparse, _make_agenda(), min_hits=1)) + self.assertFalse(insight_in_scope(sparse, _make_agenda(), min_hits=3)) + # Zero disables the gate. + self.assertTrue(insight_in_scope(self.OFF_SCOPE, _make_agenda(), min_hits=0)) + + def test_non_ascii_only_agenda_disables_gate(self): + from agents.agenda_loader import parse_agenda + from agents.agenda_relevance import insight_in_scope + + agenda = parse_agenda( + { + "version": "v1", + "name": "chinese-only", + "description": "粗差检测", + "focus": ["粗差检测"], + } + ) + # No ASCII-matchable term: dropping everything would be worse than + # dropping nothing, so the gate stands down. + self.assertTrue(insight_in_scope(self.OFF_SCOPE, agenda)) + + +class Tier2DroppedCountTests(unittest.TestCase): + def _run(self): + import agents.paper_idea_agent as pia + + in_scope_problem = { + "title": "Threshold-free outlier rejection for correspondence graphs", + "source_type": "limitation", + "source_evidence": "3 papers report fixed inlier thresholds", + "formal_statement": "Minimize surviving wrong matches in SfM correspondence pruning", + "current_failure_mode": "Fixed global thresholds ignore scene geometry", + "desideratum": "Per-edge adaptive filtering", + "impact_scope": "matching pipelines", + "mechanism_type": "protocol_artifact", + "related_node_ids": ["cv.sfm"], + } + off_scope_problem = { + "title": "Retrieval grounding gap in code synthesis", + "source_type": "plateau", + "source_evidence": "5 papers within 1%", + "formal_statement": "Minimize hallucinated API calls in retrieval-augmented code synthesis", + "current_failure_mode": "Retriever and generator trained separately", + "desideratum": "Joint objective", + "impact_scope": "code assistants", + "mechanism_type": "plateau", + "related_node_ids": ["nlp.codegen"], + } + method_payload = { + "method": { + "name": "EdgeGate", + "type": "training_procedure", + "one_line": "Per-edge gating", + "definition": "g(e) = sigma(w^T f(e))", + "key_properties": [], + "why_novel": "Differs from the three closest filters by gating per edge with learned geometry features", + "limitations": "needs labels", + } + } + exp_in = {"paper_title": "Adaptive correspondence pruning without global thresholds"} + exp_off = {"paper_title": "Jointly trained retrieval for grounded code synthesis"} + + with ( + mock.patch.object(pia, "get_tier2_signals", return_value=_tier2_signals()), + mock.patch.object(pia, "agenda_taxonomy_node_ids", return_value=[]), + mock.patch.object( + pia, + "call_llm_json", + side_effect=[ + ({"problems": [in_scope_problem, off_scope_problem]}, 10), + (method_payload, 10), + (exp_in, 10), + (method_payload, 10), + (exp_off, 10), + ], + ) as llm, + mock.patch.object(pia, "get_evosci_input_issue", return_value=None), + mock.patch.object(pia, "enrich_deep_insight", side_effect=lambda d: d), + mock.patch.object(pia, "build_evidence_packet", return_value={}), + ): + result = pia.discover_paper_ideas(agenda=_make_agenda()) + return result, llm + + def test_off_scope_idea_is_dropped_and_counted(self): + result, llm = self._run() + self.assertEqual(result["dropped_out_of_scope"], 1) + self.assertEqual(len(result["insights"]), 1) + self.assertIn("pruning", result["insights"][0]["title"].lower()) + # The generation prompt itself carried the constraint block. + problem_prompt = llm.call_args_list[0].args[1] + self.assertIn(CONSTRAINT_HEADER, problem_prompt) + self.assertIn(DIRECTION_TEXT, problem_prompt) + + +class Tier1DroppedCountTests(unittest.TestCase): + def _run(self): + import agents.paradigm_agent as pa + + in_scope_candidate = { + "title": "Outlier rejection and correspondence pruning share a consistency structure", + "confidence": 9, + "field_a": {}, + "field_b": {}, + "unifying_structure": "graph consistency", + "shared_failure_mode": "threshold sensitivity", + "evidence_from_graph": "shared ransac usage", + } + off_scope_candidate = { + "title": "Dialogue state tracking unifies with knowledge base completion", + "confidence": 5, + "field_a": {}, + "field_b": {}, + "unifying_structure": "slot filling", + "shared_failure_mode": "schema drift", + "evidence_from_graph": "shared ontology entities", + } + formal_in = { + "title": "Consistency-constrained outlier rejection across SfM pipelines", + "formal_structure": "argmin over correspondence subgraphs", + "transformation": "map pruning rules to consistency potentials", + } + formal_off = { + "title": "Dialogue state tracking as schema alignment", + "formal_structure": "argmax over slot assignments", + "transformation": "map ontology edges to slot values", + } + adversarial = { + "overall_score": 8, + "verdict": "interesting", + "attacks": [], + "strongest_attack": "", + } + + with ( + mock.patch.object(pa, "get_tier1_signals", return_value=_tier1_signals()), + mock.patch.object(pa, "agenda_taxonomy_node_ids", return_value=[]), + mock.patch.object( + pa, + "call_llm_json", + side_effect=[ + ({"candidates": [in_scope_candidate, off_scope_candidate]}, 10), + (formal_in, 10), + (formal_off, 10), + ], + ) as llm, + mock.patch.object(pa, "_call_with_provider", return_value=(adversarial, 5)), + mock.patch.object(pa, "get_evosci_input_issue", return_value=None), + mock.patch.object(pa, "enrich_deep_insight", side_effect=lambda d: d), + mock.patch.object(pa, "build_evidence_packet", return_value={}), + ): + result = pa.discover_paradigm_insights(agenda=_make_agenda()) + return result, llm + + def test_off_scope_insight_is_dropped_and_counted(self): + result, llm = self._run() + self.assertEqual(result["dropped_out_of_scope"], 1) + self.assertEqual(len(result["insights"]), 1) + self.assertIn("outlier rejection", result["insights"][0]["title"].lower()) + structure_prompt = llm.call_args_list[0].args[1] + self.assertIn(CONSTRAINT_HEADER, structure_prompt) + self.assertIn(DIRECTION_TEXT, structure_prompt) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_vnext_discovery.py b/tests/test_vnext_discovery.py index 47da0d1..ab404fc 100644 --- a/tests/test_vnext_discovery.py +++ b/tests/test_vnext_discovery.py @@ -133,7 +133,10 @@ def test_run_tier1_discovery_skips_zero_id_and_keeps_next_result(self): with ( mock.patch.object(discovery_scheduler, "_init_schema_v2"), - mock.patch("agents.paradigm_agent.discover_paradigm_insights", return_value=insights), + mock.patch( + "agents.paradigm_agent.discover_paradigm_insights", + return_value={"insights": insights, "dropped_out_of_scope": 0}, + ), mock.patch("agents.paradigm_agent.store_deep_insight", side_effect=[0, 42]), mock.patch.object(discovery_scheduler, "log_event"), ):