From 232ce05541a38770fa63601c5eeeb75ebc58ecb8 Mon Sep 17 00:00:00 2001 From: currentlycodinng <148545995+currentlycodinng@users.noreply.github.com> Date: Tue, 10 Mar 2026 13:55:18 +0100 Subject: [PATCH 1/6] Add prompt-learning skill: automated feedback-driven prompt improvement MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Implements the prompt-learning skill (RES-397) which automatically improves prompts by collecting feedback, normalizing it, generating "If [TRIGGER] then [ACTION]" rules via a meta-prompt, and validating with A/B experiments. - Create SKILL.md with 6-phase pipeline (Collect → Normalize → Meta-Prompt → Aggregate → Apply → Validate) and research-validated defaults (f=10, p=3) - Create resources/meta-prompt.md (v2 batch-mode template with issue taxonomy, worked example, and structured output format) - Add bidirectional companion skill references (feedback-loop, optimize-prompt, run-experiment, build-evaluator, trace-analysis) - Add validation script for structural/reference integrity checks - Regenerate artifacts via publish.sh (AGENTS.md, README.md, plugins) Co-Authored-By: Claude Opus 4.6 --- README.md | 1 + agents/AGENTS.md | 2 + scripts/validate_prompt_learning.py | 268 ++++++++++++++++ skills/build-evaluator/SKILL.md | 4 + skills/feedback-loop/SKILL.md | 1 + skills/optimize-prompt/SKILL.md | 1 + skills/prompt-learning/SKILL.md | 302 ++++++++++++++++++ .../prompt-learning/resources/meta-prompt.md | 285 +++++++++++++++++ skills/run-experiment/SKILL.md | 4 +- skills/trace-analysis/SKILL.md | 1 + 10 files changed, 868 insertions(+), 1 deletion(-) create mode 100644 scripts/validate_prompt_learning.py create mode 100644 skills/prompt-learning/SKILL.md create mode 100644 skills/prompt-learning/resources/meta-prompt.md diff --git a/README.md b/README.md index 7abf888..a7737af 100644 --- a/README.md +++ b/README.md @@ -235,6 +235,7 @@ The `action-plan` skill can file tickets directly to Linear. 
See [Linear MCP set | **manage-memory** | Create and configure orq.ai Memory Stores for persistent context in conversational agents | [SKILL.md](skills/manage-memory/SKILL.md) | | **monitor-production** | Analyze production trace data for anomalies, cost trends, latency regressions, and emerging failure modes | [SKILL.md](skills/monitor-production/SKILL.md) | | **optimize-prompt** | Systematically iterate on a prompt deployment using trace data, A/B testing, and structured refinement techniques | [SKILL.md](skills/optimize-prompt/SKILL.md) | +| **prompt-learning** | Automatically improve prompts by collecting feedback, generating "If [TRIGGER] then [ACTION]" rules via a meta-prompt, and validating with A/B experiments | [SKILL.md](skills/prompt-learning/SKILL.md) | | **regression-test** | Run a quick regression check against a golden dataset to verify recent changes haven't degraded quality | [SKILL.md](skills/regression-test/SKILL.md) | | **run-experiment** | End-to-end LLM evaluation workflow — error analysis, dataset creation, experiment execution, result analysis, and ticket filing | [SKILL.md](skills/run-experiment/SKILL.md) | | **scaffold-integration** | Generate SDK integration code (Python or Node) for orq.ai agents, deployments, and knowledge bases in the user's codebase | [SKILL.md](skills/scaffold-integration/SKILL.md) | diff --git a/agents/AGENTS.md b/agents/AGENTS.md index eef8644..d29e459 100644 --- a/agents/AGENTS.md +++ b/agents/AGENTS.md @@ -20,6 +20,7 @@ These skills are: - manage-memory -> "skills/manage-memory/SKILL.md" - monitor-production -> "skills/monitor-production/SKILL.md" - optimize-prompt -> "skills/optimize-prompt/SKILL.md" + - prompt-learning -> "skills/prompt-learning/SKILL.md" - regression-test -> "skills/regression-test/SKILL.md" - run-experiment -> "skills/run-experiment/SKILL.md" - scaffold-integration -> "skills/scaffold-integration/SKILL.md" @@ -47,6 +48,7 @@ manage-deployment: `Configure, version, and manage orq.ai deployments — model manage-memory: `Create and configure orq.ai Memory Stores for persistent context in conversational agents` monitor-production: `Analyze production trace data for anomalies, cost trends, latency regressions, and emerging failure modes` optimize-prompt: `Systematically iterate on a prompt deployment using trace data, A/B testing, and structured refinement techniques` +prompt-learning: `Automatically improve prompts by collecting feedback, generating "If [TRIGGER] then [ACTION]" rules via a meta-prompt, and validating with A/B experiments` regression-test: `Run a quick regression check against a golden dataset to verify recent changes haven't degraded quality` run-experiment: `End-to-end LLM evaluation workflow — error analysis, dataset creation, experiment execution, result analysis, and ticket filing` scaffold-integration: `Generate SDK integration code (Python or Node) for orq.ai agents, deployments, and knowledge bases in the user's codebase` diff --git a/scripts/validate_prompt_learning.py b/scripts/validate_prompt_learning.py new file mode 100644 index 0000000..6ae0530 --- /dev/null +++ b/scripts/validate_prompt_learning.py @@ -0,0 +1,268 @@ +#!/usr/bin/env -S uv run +# /// script +# requires-python = ">=3.10" +# dependencies = [] +# /// +"""Validate the prompt-learning skill integration. + +Checks: +1. Frontmatter matches existing skill patterns +2. All companion skill references point to existing skills +3. AGENTS.md includes the new skill entry +4. 
resources/meta-prompt.md is properly referenced in SKILL.md steps +""" + +from __future__ import annotations + +import re +import sys +from pathlib import Path + +ROOT = Path(__file__).resolve().parent.parent +SKILL_DIR = ROOT / "skills" / "prompt-learning" +SKILL_MD = SKILL_DIR / "SKILL.md" +META_PROMPT = SKILL_DIR / "resources" / "meta-prompt.md" +AGENTS_MD = ROOT / "agents" / "AGENTS.md" + +PASS = "\033[32m✓\033[0m" +FAIL = "\033[31m✗\033[0m" + + +def parse_frontmatter(text: str) -> dict[str, str]: + match = re.search(r"^---\s*\n(.*?)\n---\s*", text, re.DOTALL) + if not match: + return {} + data: dict[str, str] = {} + for line in match.group(1).splitlines(): + if ":" not in line: + continue + key, value = line.split(":", 1) + data[key.strip()] = value.strip() + return data + + +def collect_existing_skills() -> set[str]: + """Discover all skills that have a SKILL.md file.""" + skills = set() + for skill_md in ROOT.glob("skills/*/SKILL.md"): + meta = parse_frontmatter(skill_md.read_text(encoding="utf-8")) + name = meta.get("name") + if name: + skills.add(name) + return skills + + +def test_frontmatter() -> list[str]: + """Check 1: Frontmatter matches existing skill patterns.""" + errors = [] + + if not SKILL_MD.exists(): + return [f"SKILL.md not found at {SKILL_MD}"] + + text = SKILL_MD.read_text(encoding="utf-8") + meta = parse_frontmatter(text) + + # Required fields + for field in ("name", "description", "allowed-tools"): + if field not in meta: + errors.append(f"Missing frontmatter field: {field}") + + # Name should match directory + if meta.get("name") != "prompt-learning": + errors.append( + f"Frontmatter name '{meta.get('name')}' doesn't match " + f"directory 'prompt-learning'" + ) + + # Description should be non-empty + if not meta.get("description"): + errors.append("Frontmatter description is empty") + + # allowed-tools should contain core tools present in all skills + allowed = meta.get("allowed-tools", "") + for tool in ("Bash", "Read", "Write", "Edit", "Grep", "Glob", "AskUserQuestion"): + if tool not in allowed: + errors.append(f"allowed-tools missing core tool: {tool}") + + # Cross-check: compare against another skill's frontmatter pattern + reference_skill = ROOT / "skills" / "optimize-prompt" / "SKILL.md" + if reference_skill.exists(): + ref_meta = parse_frontmatter( + reference_skill.read_text(encoding="utf-8") + ) + ref_fields = set(ref_meta.keys()) + our_fields = set(meta.keys()) + missing = ref_fields - our_fields + if missing: + errors.append( + f"Frontmatter missing fields present in optimize-prompt: {missing}" + ) + + return errors + + +def test_companion_skills() -> list[str]: + """Check 2: All companion skill references point to existing skills.""" + errors = [] + + if not SKILL_MD.exists(): + return [f"SKILL.md not found at {SKILL_MD}"] + + text = SKILL_MD.read_text(encoding="utf-8") + existing = collect_existing_skills() + + # Extract companion skill names from backtick references after "Companion skills:" + companion_section = re.search( + r"\*\*Companion skills:\*\*\s*\n((?:- .*\n)*)", text + ) + if not companion_section: + errors.append("No 'Companion skills' section found") + return errors + + companion_names = re.findall(r"`([^`]+)`", companion_section.group(1)) + if not companion_names: + errors.append("No companion skills listed") + return errors + + for name in companion_names: + if name not in existing: + errors.append(f"Companion skill '{name}' does not exist as a skill") + + # Check bidirectional: do companion skills reference us back? 
+ for name in companion_names: + companion_path = ROOT / "skills" / name / "SKILL.md" + if companion_path.exists(): + companion_text = companion_path.read_text(encoding="utf-8") + if "prompt-learning" not in companion_text: + errors.append( + f"Companion '{name}' does not reference 'prompt-learning' back" + ) + + return errors + + +def test_agents_md() -> list[str]: + """Check 3: AGENTS.md has correct formatting with new entry.""" + errors = [] + + if not AGENTS_MD.exists(): + return [f"AGENTS.md not found at {AGENTS_MD}"] + + text = AGENTS_MD.read_text(encoding="utf-8") + + # Check skill path entry + expected_path = 'prompt-learning -> "skills/prompt-learning/SKILL.md"' + if expected_path not in text: + errors.append(f"AGENTS.md missing path entry: {expected_path}") + + # Check description entry in available_skills + if "prompt-learning:" not in text: + errors.append("AGENTS.md missing description entry for prompt-learning") + + # Check alphabetical ordering in the skills list + path_entries = re.findall( + r" - (\S+) -> ", text + ) + if path_entries: + sorted_entries = sorted(path_entries, key=str.lower) + if path_entries != sorted_entries: + errors.append("AGENTS.md skill list is not alphabetically sorted") + + # Check alphabetical ordering in available_skills + desc_entries = re.findall(r"^(\S+):", text, re.MULTILINE) + # Filter to only skill entries (those that have backtick descriptions) + skill_descs = [ + e for e in desc_entries + if f"{e}:" in text and "`" in text.split(f"{e}:")[1].split("\n")[0] + ] + if skill_descs: + sorted_descs = sorted(skill_descs, key=str.lower) + if skill_descs != sorted_descs: + errors.append( + "AGENTS.md available_skills descriptions not alphabetically sorted" + ) + + return errors + + +def test_meta_prompt_reference() -> list[str]: + """Check 4: resources/meta-prompt.md exists and is referenced in SKILL.md.""" + errors = [] + + if not META_PROMPT.exists(): + errors.append(f"Meta-prompt not found at {META_PROMPT}") + return errors + + if not SKILL_MD.exists(): + errors.append(f"SKILL.md not found at {SKILL_MD}") + return errors + + skill_text = SKILL_MD.read_text(encoding="utf-8") + + # Check that SKILL.md references the meta-prompt file + if "resources/meta-prompt.md" not in skill_text: + errors.append( + "SKILL.md does not reference 'resources/meta-prompt.md'" + ) + + # Check that meta-prompt contains key structural elements + meta_text = META_PROMPT.read_text(encoding="utf-8") + + required_sections = [ + "GOAL", + "INPUTS", + "PROCESS", + "OUTPUT FORMAT", + "EXAMPLE", + "ISSUE TAXONOMY", + "FAILURE_EXAMPLES", + "POSITIVE_EXAMPLES", + "RULES_TO_APPEND", + "ITERATION_GUIDANCE", + ] + for section in required_sections: + if section not in meta_text: + errors.append(f"Meta-prompt missing required section: {section}") + + # Check that the meta-prompt has the "If [TRIGGER] then [ACTION]" rule format + if "If [TRIGGER]" not in meta_text: + errors.append( + "Meta-prompt missing rule format: 'If [TRIGGER], then [ACTION]'" + ) + + # Check that LEARNED_RULES section is referenced + if "LEARNED_RULES" not in meta_text: + errors.append("Meta-prompt missing LEARNED_RULES reference") + + return errors + + +def main() -> None: + checks = [ + ("Frontmatter matches existing skill patterns", test_frontmatter), + ("Companion skill references are valid", test_companion_skills), + ("AGENTS.md includes prompt-learning correctly", test_agents_md), + ("Meta-prompt is referenced and well-structured", test_meta_prompt_reference), + ] + + total_errors = 0 + for label, check_fn 
in checks: + errors = check_fn() + if errors: + print(f"{FAIL} {label}") + for err in errors: + print(f" - {err}") + total_errors += len(errors) + else: + print(f"{PASS} {label}") + + print() + if total_errors: + print(f"{total_errors} error(s) found.") + sys.exit(1) + else: + print("All checks passed.") + + +if __name__ == "__main__": + main() diff --git a/skills/build-evaluator/SKILL.md b/skills/build-evaluator/SKILL.md index baf2066..3cf8a0f 100644 --- a/skills/build-evaluator/SKILL.md +++ b/skills/build-evaluator/SKILL.md @@ -8,6 +8,10 @@ allowed-tools: Bash, Read, Write, Edit, Grep, Glob, WebFetch, Task, AskUserQuest Design and create production-grade LLM evaluators on the orq.ai platform, grounded in evaluation best practices. +**Companion skills:** +- `prompt-learning` — uses evaluator scores as AI feedback to generate prompt rules +- `run-experiment` — run experiments using evaluators built with this skill + ## When to use - User asks to create an LLM-as-a-Judge evaluator diff --git a/skills/feedback-loop/SKILL.md b/skills/feedback-loop/SKILL.md index 34df427..91a99af 100644 --- a/skills/feedback-loop/SKILL.md +++ b/skills/feedback-loop/SKILL.md @@ -11,6 +11,7 @@ Set up user feedback collection and analyze feedback patterns to drive data-info **Companion skills:** - `trace-analysis` — deep-dive into traces flagged by negative feedback - `action-plan` — prioritize improvements based on feedback patterns +- `prompt-learning` — automatically turn feedback patterns into prompt rules - `scaffold-integration` — generate SDK code for feedback collection ## When to use diff --git a/skills/optimize-prompt/SKILL.md b/skills/optimize-prompt/SKILL.md index 69a646c..0004da5 100644 --- a/skills/optimize-prompt/SKILL.md +++ b/skills/optimize-prompt/SKILL.md @@ -12,6 +12,7 @@ Systematically improve prompt deployments through trace-driven failure analysis, - `trace-analysis` — identify failure patterns that inform prompt edits - `run-experiment` — run A/B experiments comparing prompt versions - `build-evaluator` — create evaluators to measure prompt improvements +- `prompt-learning` — automated feedback-driven rule generation (complementary approach) ## When to use diff --git a/skills/prompt-learning/SKILL.md b/skills/prompt-learning/SKILL.md new file mode 100644 index 0000000..1f4d09f --- /dev/null +++ b/skills/prompt-learning/SKILL.md @@ -0,0 +1,302 @@ +--- +name: prompt-learning +description: Automatically improve prompts by collecting feedback, generating "If [TRIGGER] then [ACTION]" rules via a meta-prompt, and validating with A/B experiments +allowed-tools: Bash, Read, Write, Edit, Grep, Glob, WebFetch, Task, AskUserQuestion, mcp__linear-server__*, orq* +--- + +# Prompt Learning + +Automatically improve prompts through feedback-driven rule generation. Collects human or AI feedback, normalizes it to a shared representation, generates targeted "If [TRIGGER] then [ACTION]" rules via a meta-prompt, and appends them to a `### LEARNED_RULES` section — then validates with A/B experiments. + +This skill is **automated and feedback-driven**, distinct from `optimize-prompt` (which is manual/trace-driven). The pipeline is: Collect → Normalize → Meta-Prompt → Aggregate → Apply → Validate. 
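+
+As a minimal sketch, the Normalize phase reduces every feedback shape to one record before any rule generation happens (illustrative only — `normalize` is a hypothetical helper, not part of the skill's tooling; the thresholds mirror the Phase 2 rules later in this document):
+
+```python
+def normalize(raw) -> dict:
+    """Map raw human or AI feedback onto the shared representation (Phase 2)."""
+    rec = {"verdict": "pass", "severity": 3, "issue_tags": [], "explanation": ""}
+    value = raw.get("value") if isinstance(raw, dict) else raw
+    if isinstance(raw, dict) and raw.get("explanation"):
+        rec["explanation"] = raw["explanation"]  # carry AI-eval explanation through
+    if value in ("fail", "pass", "borderline"):   # human categorical verdict
+        rec["verdict"] = value
+    elif isinstance(value, bool):                 # AI eval boolean
+        rec["verdict"] = "pass" if value else "fail"
+    elif isinstance(value, (int, float)):         # numerical rating on a 1-5 scale
+        rec["verdict"] = "fail" if value <= 2 else "borderline" if value == 3 else "pass"
+    else:                                         # free text — enrich via LLM (Phase 2, step 4)
+        rec["verdict"] = "borderline"             # placeholder until enriched
+        rec["explanation"] = str(value)
+    return rec
+```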
+**Companion skills:**
+- `feedback-loop` — set up feedback collection and analyze feedback patterns
+- `optimize-prompt` — manual trace-driven prompt refinement (complementary approach)
+- `run-experiment` — run A/B experiments comparing prompt versions
+- `build-evaluator` — create evaluators to measure prompt improvements
+- `trace-analysis` — deep-dive into traces to understand failure modes
+
+## When to use
+
+- User has feedback (human or AI evaluator) and wants automated prompt improvement
+- User wants to learn rules from production feedback patterns
+- User asks "how do I automatically improve my prompt from feedback?"
+- Action plan recommends feedback-driven prompt improvement
+- User wants to close the loop between feedback collection and prompt updates
+- User has evaluator scores and wants to turn failures into prompt rules
+
+## orq.ai Documentation
+
+Consult these docs when working with the orq.ai platform:
+- **Prompts overview:** https://docs.orq.ai/docs/prompts/overview
+- **Prompt management:** https://docs.orq.ai/docs/prompts/management
+- **Deployments overview:** https://docs.orq.ai/docs/deployments/overview
+- **Experiments:** https://docs.orq.ai/docs/experiments/creating
+- **Traces:** https://docs.orq.ai/docs/observability/traces
+- **Feedback:** https://docs.orq.ai/docs/feedback/overview
+- **Evaluators:** https://docs.orq.ai/docs/evaluators/overview
+
+### orq.ai Prompt Capabilities
+- Prompts are versioned — each edit creates a new version, previous versions are preserved
+- Deployments link to specific prompt versions and model configurations
+- Experiments can compare two prompt versions on the same dataset
+- Template variables: `{{log.input}}`, `{{log.output}}`, `{{log.messages}}`, `{{log.retrievals}}`, `{{log.reference}}`
+- Rules are appended to a `### LEARNED_RULES` section — the rest of the prompt remains untouched
+
+### orq MCP Tools
+
+Use the orq MCP server (`https://my.orq.ai/v2/mcp`) as the primary interface. For operations not yet available via MCP, use the HTTP API as fallback.
+
+**Available MCP tools for this skill:**
+
+| Tool | Purpose |
+|------|---------|
+| `search_entities` | Find prompts (`type: "prompts"`) and deployments |
+| `list_traces` | Pull recent traces with feedback data |
+| `list_spans` | List spans within a trace |
+| `get_span` | Get detailed span information |
+| `create_experiment` | Run A/B experiment comparing prompt versions |
+| `list_experiment_runs` | Check experiment progress |
+| `get_experiment_run` | Get experiment results |
+
+**HTTP API fallback** (for operations not yet in MCP):
+
+```bash
+# List prompts
+curl -s https://my.orq.ai/v2/prompts \
+  -H "Authorization: Bearer $ORQ_API_KEY" \
+  -H "Content-Type: application/json" | jq
+
+# Get prompt details with versions
+curl -s https://my.orq.ai/v2/prompts/<prompt_id> \
+  -H "Authorization: Bearer $ORQ_API_KEY" \
+  -H "Content-Type: application/json" | jq
+
+# Create a new prompt version
+curl -s -X POST https://my.orq.ai/v2/prompts/<prompt_id>/versions \
+  -H "Authorization: Bearer $ORQ_API_KEY" \
+  -H "Content-Type: application/json" \
+  -d '{"messages": [...], "model": "...", "parameters": {...}}' | jq
+```
+
+## Core Principles
+
+### 1. Feedback Is Fuel, Not Truth
+Human and AI feedback use the **same core method** — only preprocessing differs. Normalize all feedback to a shared representation (verdict + severity + issue tags + explanation) before processing. Never trust a single piece of feedback in isolation.
+
+### 2. 
Only Recurring Patterns Get Rules +Require 2+ occurrences of a pattern before generating a rule. One-off issues are noise, not signal. The meta-prompt explicitly skips single-occurrence patterns. + +### 3. Rules Are Additive, Never Destructive +Rules are appended to a `### LEARNED_RULES` section in the prompt. Never rewrite or remove existing prompt instructions. Rules augment the prompt — they don't replace it. + +### 4. Positive Anchors Prevent Regression +Always include p=3 positive traces (regression anchors) in each meta-prompt batch. These ensure generated rules don't break what already works. Skipping anchors leads to over-correction. + +### 5. Validate Before Promoting +Never deploy rules without running an A/B experiment. The validation experiment compares the prompt with rules against the baseline on the same dataset. No vibes-checking. + +## Issue Taxonomy + +The meta-prompt classifies failures into these types: + +| Issue Type | Description | +|------------|-------------| +| `accuracy` | Factually incorrect or imprecise outputs | +| `missing_requirement` | Fails to address part of the user's request | +| `policy` | Violates organizational policies or guidelines | +| `safety` | Produces harmful, biased, or inappropriate content | +| `formatting` | Wrong output structure, missing fields, schema violations | +| `verbosity` | Too long or too short for the context | +| `tone` | Inappropriate register, persona drift | +| `tool_use` | Wrong tool selected, incorrect arguments, misinterpreted results | +| `reasoning` | Flawed logic, incorrect deductions | +| `hallucination` | Fabricated facts, citations, or capabilities | + +## Destructive Actions + +The following actions require explicit user confirmation via `AskUserQuestion` before execution: +- Applying generated rules to a prompt (creating a new version with `### LEARNED_RULES`) +- Promoting a rule-enhanced prompt version to a production deployment +- Removing or modifying existing learned rules + +## Research-Validated Defaults + +These defaults come from systematic experiments (RES-205): + +| Parameter | Default | Range | Notes | +|-----------|---------|-------|-------| +| Failures per batch (f) | 10 | 5-15 | f=10 outperforms f=5 (+0.27) and f=20 (+1.23) | +| Positives per batch (p) | 3 | 2-5 | p=0 over-corrects (8 rules); p=3 focuses (2 rules) | +| Iterations | 2 | 1-5 | 2 for GPT/Claude; up to 5 for Gemini | +| Occurrence threshold | 2+ | — | One-offs are skipped | +| Rules per iteration | 1-5 | — | Prioritized by frequency × severity | +| Total rule cap | 10 | — | Across all iterations | + +## Steps + +Follow these steps **in order**. Do NOT skip steps. + +### Phase 1: Identify Target and Collect Feedback (Collect) + +1. **Identify the target prompt/deployment:** + - Use `search_entities` with `type: "prompts"` to find the target prompt + - Use HTTP API to get full prompt details including current version + - Document: system message, user template, model, parameters + +2. **Collect feedback data:** + - Use `list_traces` to pull traces with feedback for the target deployment + - Collect at least 50 traces (more is better) from a meaningful time period + - Separate into: negative feedback traces and positive feedback traces + - Identify the feedback source: human (thumbs up/down, corrections, free-text) or AI evaluator scores + +3. 
**Verify sufficient data:**
+   - Need at least f=10 failure traces to proceed
+   - Need at least p=3 positive traces for regression anchors
+   - If insufficient, inform user and suggest using `feedback-loop` to set up collection first
+
+### Phase 2: Normalize Feedback (Normalize)
+
+4. **Normalize feedback to shared representation:**
+
+   All feedback — human or AI — gets normalized to this shape:
+   ```json
+   {
+     "verdict": "fail" | "pass" | "borderline",
+     "severity": 1-5,
+     "issue_tags": ["<issue_tag>"],
+     "explanation": "..."
+   }
+   ```
+
+   **For human feedback:**
+   - Thumbs down → `{"verdict": "fail", "severity": 3, "issue_tags": [], "explanation": ""}`
+   - Free-text correction → extract issue tags and explanation from the text
+   - Numerical rating (e.g., 1-5) → map to verdict (1-2: fail, 3: borderline, 4-5: pass)
+
+   **For AI evaluator feedback:**
+   - Boolean false → `{"verdict": "fail", "severity": 3, "issue_tags": [], "explanation": ""}`
+   - Categorical/numerical → map to verdict based on scale, carry explanation through
+
+   If raw feedback lacks explanations (e.g., bare thumbs-down), use the LLM to enrich: pass the input/output pair and ask for a brief failure analysis to populate `issue_tags` and `explanation`.
+
+5. **Sample the batch:**
+   - Sample f=10 representative failures (diverse issue types, not all the same failure)
+   - Sample p=3 positive traces as regression anchors
+   - If more failures exist, prioritize diversity across issue types
+
+### Phase 3: Generate Rules (Meta-Prompt)
+
+6. **Load the meta-prompt template** from `resources/meta-prompt.md`.
+
+7. **Fill in the meta-prompt variables:**
+   - `PROMPT_TYPE`: "agent" or "evaluator" based on target
+   - `CURRENT_PROMPT`: full text of the current prompt version
+   - `ITERATION`: current iteration number (starts at 1)
+   - `FEEDBACK_SOURCE`: "human" or "ai_eval"
+   - `FAILURE_EXAMPLES`: the f=10 sampled failures with normalized feedback
+   - `POSITIVE_EXAMPLES`: the p=3 sampled positive traces
+
+8. **Execute the meta-prompt** and collect the structured output:
+   - A) PATTERN_ANALYSIS — recurring patterns and one-off issues
+   - B) ANCHOR_CHECK — conflicts with positive anchors
+   - C) RULES — numbered "If [TRIGGER] then [ACTION]" rules
+   - D) RULES_TO_APPEND — formatted text block for the prompt
+   - E) REGRESSION_TESTS — test cases for validation
+   - F) ITERATION_GUIDANCE — continue or stop recommendation
+
+9. **Review the output** with the user:
+   - Show the identified patterns and their frequency/severity
+   - Show the generated rules
+   - Highlight any anchor conflicts
+   - Present ITERATION_GUIDANCE recommendation
+
+### Phase 4: Aggregate and Apply
+
+10. **Aggregate rules** across iterations (if iteration > 1):
+    - Merge new rules with existing `### LEARNED_RULES` section
+    - Remove duplicates or conflicting rules
+    - Enforce total rule cap of 10
+
+11. **Apply rules to the prompt** — **ask user confirmation first:**
+    - Create a new prompt version with `### LEARNED_RULES` section appended
+    - Use HTTP API to create the new version
+    - Document what rules were added and which patterns they address
+
+    Format of the appended section:
+    ```
+    ### LEARNED_RULES
+    - If [TRIGGER], then [ACTION].
+    - If [TRIGGER], then [ACTION].
+    ...
+    ```
+
+### Phase 5: Validate (A/B Experiment)
+
+12. 
**Set up a validation experiment:** + - Use `create_experiment` to compare baseline (no rules) vs variant (with rules) + - Use the same dataset for both runs + - Include evaluators that measure the targeted failure types + - Include the regression tests from the meta-prompt output + +13. **Run the experiment and analyze results:** + - Use `list_experiment_runs` to monitor progress + - Use `get_experiment_run` to fetch results + - Compare: + ``` + | Evaluator | Baseline | Variant | Delta | + |-----------|----------|---------|-------| + | [target metric] | X% | Y% | +Z% | + | [regression metric] | X% | Y% | +Z% | + ``` + +14. **Decision framework:** + - **Clear win** (>5% improvement on target, no regression) → Promote variant + - **Mixed results** (improvement + regression elsewhere) → Investigate, iterate + - **No improvement** → Re-examine feedback normalization, try different samples + - **Regression** → Revert, rules may be too aggressive + +### Phase 6: Iterate + +15. **Check iteration guidance:** + - If meta-prompt recommends `"continue"` AND iteration < max (2 for GPT/Claude, 5 for Gemini): + - Return to Phase 1 Step 2 with updated prompt (now including rules) + - Collect fresh feedback or use remaining unprocessed failures + - Increment iteration counter + - If meta-prompt recommends `"stop"` OR max iterations reached: + - Present final summary to user + - If validated, **ask user confirmation** to promote to production deployment + +16. **Final summary:** + ``` + ## Prompt Learning Summary + - **Target:** [prompt/deployment name] + - **Iterations completed:** [N] + - **Rules generated:** [N] + - **Feedback source:** human | ai_eval + - **Key patterns addressed:** [list] + - **Validation result:** [pass/fail with metrics] + - **Status:** [promoted / pending promotion / reverted] + ``` + +## Anti-Patterns + +| Anti-Pattern | Why It's Wrong | What to Do Instead | +|---|---|---| +| Acting on single-occurrence feedback | Noise, not signal | Require 2+ occurrences before generating rules | +| Rewriting the whole prompt with rules | Destroys existing instructions | Only append to `### LEARNED_RULES` section | +| Skipping positive anchors | Rules may break what already works | Always include p=3 positive traces in each batch | +| Running more than 5 iterations | Diminishing returns, overfitting risk | Stop at 2 iterations (5 for Gemini) | +| Treating human and AI feedback differently in the pipeline | Research shows same method works for both | Normalize to shared representation, then process identically | +| Deploying rules without validation experiment | No evidence the rules actually help | Always A/B test before promoting | +| Generating rules from too few failures | Insufficient pattern evidence | Wait for f=10 failures minimum per batch | + +## Open in orq.ai + +After completing this skill, direct the user to the relevant platform page: + +- **View/edit the prompt:** `https://my.orq.ai/prompts` — review the prompt with `### LEARNED_RULES` section +- **Check traces with feedback:** `https://my.orq.ai/traces` — inspect traces that provided the feedback signal +- **View experiment results:** `https://my.orq.ai/experiments` — review the A/B validation experiment +- **Feedback overview:** `https://my.orq.ai/feedback` — monitor ongoing feedback collection diff --git a/skills/prompt-learning/resources/meta-prompt.md b/skills/prompt-learning/resources/meta-prompt.md new file mode 100644 index 0000000..0aa56c2 --- /dev/null +++ b/skills/prompt-learning/resources/meta-prompt.md @@ -0,0 +1,285 
@@
+# Prompt Learning Meta-Prompt (v2, batch-mode)
+
+Use this meta-prompt template when generating rules from feedback. Pass it the variables described in INPUTS below.
+
+---
+
+```
+You are a prompt engineer improving a prompt based on feedback from multiple examples.
+
+══════════════════════════════════════════════════════════════════════
+GOAL
+══════════════════════════════════════════════════════════════════════
+Analyze a batch of feedback (failures + positive anchors) and produce minimal, high-impact rules that:
+1. Fix recurring failure patterns
+2. Don't break existing good behavior (regression anchors)
+
+══════════════════════════════════════════════════════════════════════
+INPUTS
+══════════════════════════════════════════════════════════════════════
+1) PROMPT_TYPE: "agent" | "evaluator"
+2) CURRENT_PROMPT: The prompt to improve
+3) ITERATION: Current iteration number (1-5)
+4) FEEDBACK_SOURCE: "human" | "ai_eval"
+
+5) FAILURE_EXAMPLES (5-15 samples with negative feedback; typically 6-14):
+   [
+     {
+       "user_input": "...",
+       "model_output": "...",
+       "reference": "..." (optional),
+       "feedback": <see FEEDBACK SHAPES below>
+     },
+     ...
+   ]
+
+6) POSITIVE_EXAMPLES (2-5 regression anchors; typically 3-5):
+   [
+     {
+       "user_input": "...",
+       "model_output": "...",
+       "feedback": "pass" | { "value": true, ... }
+     },
+     ...
+   ]
+
+FEEDBACK SHAPES:
+- Human categorical: "fail" | "pass" | "borderline"
+- Human numerical: 3 (just the number)
+- Human free text: "The response was too vague..."
+- AI eval boolean: { "value": true|false, "explanation": "..." }
+- AI eval categorical: { "value": "A"|"B"|"C", "explanation": "..." }
+- AI eval numerical: { "value": 6, "scale": "1-10", "explanation": "..." }
+- Enriched normalized (recommended if raw feedback lacks explanations):
+  { "verdict": "fail", "severity": 4, "issue_tags": ["missing_requirement"], "explanation": "..." }
+
+══════════════════════════════════════════════════════════════════════
+PROCESS
+══════════════════════════════════════════════════════════════════════
+
+STEP 1: ANALYZE FAILURE PATTERNS
+Group the failure examples by issue type. Identify recurring patterns (2+ occurrences).
+
+ISSUE TAXONOMY:
+- accuracy: factually wrong
+- missing_requirement: didn't fulfill explicit requirement
+- policy: violated policy/guideline
+- safety: unsafe content
+- formatting: wrong format/structure
+- verbosity: too long/short
+- tone: wrong tone/style
+- tool_use: incorrect tool usage
+- reasoning: flawed logic
+- hallucination: made up information
+
+Output pattern analysis:
+{
+  "patterns": [
+    {
+      "issue_tag": "<tag>",
+      "count": <n>,
+      "severity": <1-5>,
+      "examples": [<indices>],
+      "root_cause": "<description>"
+    }
+  ],
+  "one_off_issues": [<tags>]
+}
+
+STEP 2: CHECK AGAINST POSITIVE ANCHORS
+For each identified pattern, verify the proposed fix won't break positive examples.
+
+{
+  "anchor_conflicts": [
+    {
+      "pattern": "<pattern>",
+      "conflicting_anchor": <index>,
+      "conflict_reason": "<reason>"
+    }
+  ]
+}
+
+STEP 3: GENERATE RULES (only for recurring patterns without conflicts)
+Create 1-5 rules that address the most impactful patterns.
+
+Rule format: "If [TRIGGER], then [ACTION]."
+
+Prioritize by: frequency × severity
+
+Skip patterns that:
+- Appeared only once (one-off)
+- Would conflict with positive anchors
+- Are too vague to create testable rules
+
+STEP 4: FORMAT RULES_TO_APPEND
+Text block for ### LEARNED_RULES section.
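+Example shape (exactly the STEP 3 rule format, one bullet per rule):
+
+### LEARNED_RULES
+- If [TRIGGER], then [ACTION].
+- If [TRIGGER], then [ACTION].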
+
+STEP 5: GENERATE REGRESSION TESTS
+Create 5-10 test cases:
+- 3-5 "should_now_pass" (based on failure patterns)
+- 2-5 "should_still_pass" (based on positive anchors)
+
+STEP 6: ITERATION GUIDANCE
+Based on current iteration, suggest:
+- Continue: if significant patterns remain unfixed
+- Stop: if diminishing returns (patterns are one-offs or low severity)
+
+══════════════════════════════════════════════════════════════════════
+OUTPUT FORMAT
+══════════════════════════════════════════════════════════════════════
+
+### A) PATTERN_ANALYSIS
+```json
+{
+  "patterns": [...],
+  "one_off_issues": [...]
+}
+```
+
+### B) ANCHOR_CHECK
+```json
+{
+  "anchor_conflicts": [...],
+  "safe_to_patch": [<tags>]
+}
+```
+
+### C) RULES
+<numbered list of "If [TRIGGER], then [ACTION]." rules>
+
+### D) RULES_TO_APPEND
+```
+<text block for the ### LEARNED_RULES section>
+```
+
+### E) REGRESSION_TESTS
+```json
+[...]
+```
+
+### F) ITERATION_GUIDANCE
+```json
+{
+  "recommendation": "continue" | "stop",
+  "reason": "<why>",
+  "remaining_issues": <N>,
+  "expected_next_iteration_gain": "high" | "medium" | "low"
+}
+```
+
+══════════════════════════════════════════════════════════════════════
+EXAMPLE
+══════════════════════════════════════════════════════════════════════
+
+Input:
+- PROMPT_TYPE: agent
+- CURRENT_PROMPT: "You are a helpful customer service assistant..."
+- ITERATION: 1
+- FEEDBACK_SOURCE: ai_eval
+
+- FAILURE_EXAMPLES:
+  [
+    {"user_input": "Return item bought 45 days ago", "model_output": "Sure, go to returns...", "feedback": {"value": false, "explanation": "Didn't check 30-day policy"}},
+    {"user_input": "Return item from 2 months ago", "model_output": "Click return button...", "feedback": {"value": false, "explanation": "Ignored return window"}},
+    {"user_input": "Refund for old purchase", "model_output": "Here's how to return...", "feedback": {"value": false, "explanation": "Should have mentioned policy limits"}},
+    {"user_input": "Where is my order?", "model_output": "Check your email", "feedback": {"value": false, "explanation": "Too brief, should offer to look up"}},
+    {"user_input": "Cancel subscription", "model_output": "Subscription cancelled.", "feedback": {"value": false, "explanation": "Didn't confirm or ask for feedback"}}
+  ]
+
+- POSITIVE_EXAMPLES:
+  [
+    {"user_input": "Return item bought yesterday", "model_output": "I'd be happy to help with your return...", "feedback": {"value": true, "explanation": "Complete and helpful"}},
+    {"user_input": "Track my package", "model_output": "Let me look that up for you. Your order #123 is...", "feedback": {"value": true, "explanation": "Proactive and specific"}}
+  ]
+
+Output:
+
+### A) PATTERN_ANALYSIS
+```json
+{
+  "patterns": [
+    {
+      "issue_tag": "policy",
+      "count": 3,
+      "severity": 4,
+      "examples": [0, 1, 2],
+      "root_cause": "Model provides return instructions without checking/mentioning the 30-day return policy"
+    },
+    {
+      "issue_tag": "verbosity",
+      "count": 1,
+      "severity": 2,
+      "examples": [3],
+      "root_cause": "Response too brief"
+    },
+    {
+      "issue_tag": "missing_requirement",
+      "count": 1,
+      "severity": 3,
+      "examples": [4],
+      "root_cause": "Didn't confirm action or gather feedback"
+    }
+  ],
+  "one_off_issues": ["verbosity", "missing_requirement (cancellation)"]
+}
+```
+
+### B) ANCHOR_CHECK
+```json
+{
+  "anchor_conflicts": [],
+  "safe_to_patch": ["policy"]
+}
+```
+
+### C) RULES
+1. If customer asks about returning an item, then first check if the purchase date is within the 30-day return window before providing return instructions.
+2. 
If return request is outside the 30-day policy, then inform customer of the policy limit and offer alternatives (store credit, exception request).
+
+### D) RULES_TO_APPEND
+```
+- If a customer requests a return, then verify the purchase date against the 30-day return policy before providing instructions.
+- If the item is outside the return window, then state the policy clearly and offer alternatives (store credit, exception request) rather than denying outright.
+```
+
+### E) REGRESSION_TESTS
+```json
+[
+  {"input": "Return item bought 45 days ago", "expected": "Mention 30-day policy, offer alternatives", "type": "should_now_pass"},
+  {"input": "Refund for purchase from last month (35 days)", "expected": "Check date, explain policy", "type": "should_now_pass"},
+  {"input": "Return item bought 2 months ago", "expected": "Policy limit + alternatives", "type": "should_now_pass"},
+  {"input": "Return item bought yesterday", "expected": "Proceed with return instructions normally", "type": "should_still_pass"},
+  {"input": "Track my package", "expected": "Look up order, provide status", "type": "should_still_pass"}
+]
+```
+
+### F) ITERATION_GUIDANCE
+```json
+{
+  "recommendation": "continue",
+  "reason": "Addressed 1 major pattern (policy/returns). Two one-off issues remain but may recur with more data.",
+  "remaining_issues": 2,
+  "expected_next_iteration_gain": "medium"
+}
+```
+
+══════════════════════════════════════════════════════════════════════
+NOW PROCESS THE ACTUAL INPUT
+══════════════════════════════════════════════════════════════════════
+```
+
+---
+
+## Template Usage Notes
+
+1. **PROMPT_TYPE**: Set to `"agent"` for agent/deployment prompts, `"evaluator"` for evaluator prompts.
+
+2. **FAILURE_EXAMPLES**: Sample f=10 failures (research-validated default). Include representative failures, not exhaustive lists. Ensure at least 2 examples per pattern you want to address.
+
+3. **POSITIVE_EXAMPLES**: Always include p=3 positive traces as regression anchors. These prevent over-correction by ensuring rules don't break existing good behavior.
+
+4. **FEEDBACK_SOURCE**: Set to `"human"` for user feedback (thumbs up/down, corrections, free-text) or `"ai_eval"` for evaluator scores. The same meta-prompt processes both — only the preprocessing (normalization) differs.
+
+5. **Iteration count**: Default to 2 iterations for most models (GPT, Claude). Use up to 5 for Gemini models. Stop early if ITERATION_GUIDANCE recommends `"stop"`.
+
+6. **Rule cap**: Maximum 10 total rules across all iterations. If approaching the cap, prioritize highest frequency × severity rules.
+
+7. **Variables**: Fill in `CURRENT_PROMPT` with the full prompt text, `FAILURE_EXAMPLES` and `POSITIVE_EXAMPLES` with sampled traces from the target deployment.
diff --git a/skills/run-experiment/SKILL.md b/skills/run-experiment/SKILL.md
index 566a8e8..38c26dc 100644
--- a/skills/run-experiment/SKILL.md
+++ b/skills/run-experiment/SKILL.md
@@ -8,7 +8,9 @@ allowed-tools: Bash, Read, Write, Edit, Grep, Glob, Task, AskUserQuestion, mcp__
 
 End-to-end workflow for evaluating LLM pipelines using the orq.ai platform, grounded in evaluation best practices.
 
-**Companion skill:** `build-evaluator` — use that skill for detailed judge prompt design. This skill orchestrates the broader workflow that wraps around it.
+**Companion skills:**
+- `build-evaluator` — use that skill for detailed judge prompt design. This skill orchestrates the broader workflow that wraps around it. 
+- `prompt-learning` — automated feedback-driven rule generation validated via experiments ## When to use diff --git a/skills/trace-analysis/SKILL.md b/skills/trace-analysis/SKILL.md index cf9c9e2..d454174 100644 --- a/skills/trace-analysis/SKILL.md +++ b/skills/trace-analysis/SKILL.md @@ -11,6 +11,7 @@ Systematic methodology for reading LLM traces, identifying failure modes, and bu **Companion skills:** - `build-evaluator` — build automated evaluators for persistent failure modes - `action-plan` — turn findings into prioritized improvement plans +- `prompt-learning` — automatically turn trace-identified patterns into prompt rules ## When to use From 5ea399b6220f20af6a5b1d7d95178a1addb42dce Mon Sep 17 00:00:00 2001 From: currentlycodinng <148545995+currentlycodinng@users.noreply.github.com> Date: Tue, 10 Mar 2026 14:14:55 +0100 Subject: [PATCH 2/6] Fix review findings: dedup taxonomy, add versioning doc, improve validation - Remove duplicated Issue Taxonomy table from SKILL.md, reference meta-prompt.md as single source of truth instead - Add missing "Prompt versioning" documentation link - Mark defaults table as preliminary pending RES-205 results - Move Template Usage Notes out of meta-prompt.md (not seen by LLM) - Add cross-file consistency check to validation script (taxonomy tags, output sections, LEARNED_RULES references) - Fix false positive in taxonomy tag regex matching Co-Authored-By: Claude Opus 4.6 --- scripts/validate_prompt_learning.py | 59 +++++++++++++++++++ skills/prompt-learning/SKILL.md | 26 +++----- .../prompt-learning/resources/meta-prompt.md | 16 +---- 3 files changed, 67 insertions(+), 34 deletions(-) diff --git a/scripts/validate_prompt_learning.py b/scripts/validate_prompt_learning.py index 6ae0530..a01116a 100644 --- a/scripts/validate_prompt_learning.py +++ b/scripts/validate_prompt_learning.py @@ -237,12 +237,71 @@ def test_meta_prompt_reference() -> list[str]: return errors +def test_cross_file_consistency() -> list[str]: + """Check 5: Parameters and taxonomy are consistent between SKILL.md and meta-prompt.md.""" + errors = [] + + if not SKILL_MD.exists() or not META_PROMPT.exists(): + return ["Cannot check consistency — files missing"] + + skill_text = SKILL_MD.read_text(encoding="utf-8") + meta_text = META_PROMPT.read_text(encoding="utf-8") + + # Check that all taxonomy tags mentioned in meta-prompt are referenced in SKILL.md + # Only match tags in the ISSUE TAXONOMY section (between "ISSUE TAXONOMY:" and next empty line) + taxonomy_match = re.search( + r"ISSUE TAXONOMY:\n((?:- \w+:.*\n)+)", meta_text + ) + taxonomy_tags = ( + re.findall(r"^- (\w+):", taxonomy_match.group(1), re.MULTILINE) + if taxonomy_match + else [] + ) + for tag in taxonomy_tags: + if tag not in skill_text: + errors.append( + f"Taxonomy tag '{tag}' in meta-prompt.md not referenced in SKILL.md" + ) + + # Check that SKILL.md taxonomy references match meta-prompt taxonomy + skill_tags = re.findall(r"`(\w+)`", skill_text.split("Issue Taxonomy")[1].split("##")[0]) if "Issue Taxonomy" in skill_text else [] + meta_tags = set(taxonomy_tags) + for tag in skill_tags: + if tag in ( + "accuracy", "missing_requirement", "policy", "safety", + "formatting", "verbosity", "tone", "tool_use", "reasoning", + "hallucination", + ) and tag not in meta_tags: + errors.append( + f"Taxonomy tag '{tag}' in SKILL.md not found in meta-prompt.md" + ) + + # Check that key structural elements referenced in SKILL.md steps exist in meta-prompt + # (e.g., output sections A-F) + for section_label in ("PATTERN_ANALYSIS", 
"ANCHOR_CHECK", "RULES_TO_APPEND", "REGRESSION_TESTS", "ITERATION_GUIDANCE"): + if section_label in skill_text and section_label not in meta_text: + errors.append( + f"SKILL.md references output section '{section_label}' not found in meta-prompt.md" + ) + + # Check that the LEARNED_RULES format is consistent + skill_has_learned_rules = "### LEARNED_RULES" in skill_text + meta_has_learned_rules = "LEARNED_RULES" in meta_text + if skill_has_learned_rules and not meta_has_learned_rules: + errors.append("SKILL.md references LEARNED_RULES but meta-prompt.md does not") + if meta_has_learned_rules and not skill_has_learned_rules: + errors.append("meta-prompt.md references LEARNED_RULES but SKILL.md does not") + + return errors + + def main() -> None: checks = [ ("Frontmatter matches existing skill patterns", test_frontmatter), ("Companion skill references are valid", test_companion_skills), ("AGENTS.md includes prompt-learning correctly", test_agents_md), ("Meta-prompt is referenced and well-structured", test_meta_prompt_reference), + ("Cross-file consistency (SKILL.md ↔ meta-prompt.md)", test_cross_file_consistency), ] total_errors = 0 diff --git a/skills/prompt-learning/SKILL.md b/skills/prompt-learning/SKILL.md index 1f4d09f..b1b0815 100644 --- a/skills/prompt-learning/SKILL.md +++ b/skills/prompt-learning/SKILL.md @@ -31,6 +31,7 @@ This skill is **automated and feedback-driven**, distinct from `optimize-prompt` Consult these docs when working with the orq.ai platform: - **Prompts overview:** https://docs.orq.ai/docs/prompts/overview - **Prompt management:** https://docs.orq.ai/docs/prompts/management +- **Prompt versioning:** https://docs.orq.ai/docs/prompts/versioning - **Deployments overview:** https://docs.orq.ai/docs/deployments/overview - **Experiments:** https://docs.orq.ai/docs/experiments/creating - **Traces:** https://docs.orq.ai/docs/observability/traces @@ -99,20 +100,7 @@ Never deploy rules without running an A/B experiment. The validation experiment ## Issue Taxonomy -The meta-prompt classifies failures into these types: - -| Issue Type | Description | -|------------|-------------| -| `accuracy` | Factually incorrect or imprecise outputs | -| `missing_requirement` | Fails to address part of the user's request | -| `policy` | Violates organizational policies or guidelines | -| `safety` | Produces harmful, biased, or inappropriate content | -| `formatting` | Wrong output structure, missing fields, schema violations | -| `verbosity` | Too long or too short for the context | -| `tone` | Inappropriate register, persona drift | -| `tool_use` | Wrong tool selected, incorrect arguments, misinterpreted results | -| `reasoning` | Flawed logic, incorrect deductions | -| `hallucination` | Fabricated facts, citations, or capabilities | +The meta-prompt classifies failures into 10 types: `accuracy`, `missing_requirement`, `policy`, `safety`, `formatting`, `verbosity`, `tone`, `tool_use`, `reasoning`, and `hallucination`. See the full taxonomy with descriptions in `resources/meta-prompt.md` (ISSUE TAXONOMY section). ## Destructive Actions @@ -121,15 +109,15 @@ The following actions require explicit user confirmation via `AskUserQuestion` b - Promoting a rule-enhanced prompt version to a production deployment - Removing or modifying existing learned rules -## Research-Validated Defaults +## Defaults -These defaults come from systematic experiments (RES-205): +> **Note:** These defaults are preliminary. RES-205 experiments are still running — values will be updated once results are final. 
| Parameter | Default | Range | Notes | |-----------|---------|-------|-------| -| Failures per batch (f) | 10 | 5-15 | f=10 outperforms f=5 (+0.27) and f=20 (+1.23) | -| Positives per batch (p) | 3 | 2-5 | p=0 over-corrects (8 rules); p=3 focuses (2 rules) | -| Iterations | 2 | 1-5 | 2 for GPT/Claude; up to 5 for Gemini | +| Failures per batch (f) | 10 | 5-15 | Representative sample, not exhaustive | +| Positives per batch (p) | 3 | 2-5 | Regression anchors to prevent over-correction | +| Iterations | 2 | 1-5 | 2 for GPT/Claude; up to 5 for Gemini (pending validation) | | Occurrence threshold | 2+ | — | One-offs are skipped | | Rules per iteration | 1-5 | — | Prioritized by frequency × severity | | Total rule cap | 10 | — | Across all iterations | diff --git a/skills/prompt-learning/resources/meta-prompt.md b/skills/prompt-learning/resources/meta-prompt.md index 0aa56c2..4346d08 100644 --- a/skills/prompt-learning/resources/meta-prompt.md +++ b/skills/prompt-learning/resources/meta-prompt.md @@ -268,18 +268,4 @@ NOW PROCESS THE ACTUAL INPUT --- -## Template Usage Notes - -1. **PROMPT_TYPE**: Set to `"agent"` for agent/deployment prompts, `"evaluator"` for evaluator prompts. - -2. **FAILURE_EXAMPLES**: Sample f=10 failures (research-validated default). Include representative failures, not exhaustive lists. Ensure at least 2 examples per pattern you want to address. - -3. **POSITIVE_EXAMPLES**: Always include p=3 positive traces as regression anchors. These prevent over-correction by ensuring rules don't break existing good behavior. - -4. **FEEDBACK_SOURCE**: Set to `"human"` for user feedback (thumbs up/down, corrections, free-text) or `"ai_eval"` for evaluator scores. The same meta-prompt processes both — only the preprocessing (normalization) differs. - -5. **Iteration count**: Default to 2 iterations for most models (GPT, Claude). Use up to 5 for Gemini models. Stop early if ITERATION_GUIDANCE recommends `"stop"`. - -6. **Rule cap**: Maximum 10 total rules across all iterations. If approaching the cap, prioritize highest frequency × severity rules. - -7. **Variables**: Fill in `CURRENT_PROMPT` with the full prompt text, `FAILURE_EXAMPLES` and `POSITIVE_EXAMPLES` with sampled traces from the target deployment. +> **Usage:** This file contains the raw prompt template between the ``` fences. Everything outside the fences is for the agent operator, not the LLM. See SKILL.md Phase 3 for how to fill in the variables. From 96961f6f4220393fdb6cbe5db352dd2ad01200c6 Mon Sep 17 00:00:00 2001 From: currentlycodinng <148545995+currentlycodinng@users.noreply.github.com> Date: Tue, 10 Mar 2026 14:28:24 +0100 Subject: [PATCH 3/6] Inline meta-prompt into SKILL.md, remove separate resource file MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Per Anthropic docs, resource files are supplemental content that Claude may skip loading. The meta-prompt is the core mechanism of this skill — it must always be available. Inlined it directly into Phase 3 Step 7. 
- Move meta-prompt template from resources/meta-prompt.md into SKILL.md - Delete resources/ directory (no longer needed) - Restore Issue Taxonomy table inline (single source of truth) - Rewrite validation script for inline meta-prompt checks Co-Authored-By: Claude Opus 4.6 --- scripts/validate_prompt_learning.py | 154 ++++------ skills/prompt-learning/SKILL.md | 91 +++++- .../prompt-learning/resources/meta-prompt.md | 271 ------------------ 3 files changed, 133 insertions(+), 383 deletions(-) delete mode 100644 skills/prompt-learning/resources/meta-prompt.md diff --git a/scripts/validate_prompt_learning.py b/scripts/validate_prompt_learning.py index a01116a..ef288f4 100644 --- a/scripts/validate_prompt_learning.py +++ b/scripts/validate_prompt_learning.py @@ -9,7 +9,7 @@ 1. Frontmatter matches existing skill patterns 2. All companion skill references point to existing skills 3. AGENTS.md includes the new skill entry -4. resources/meta-prompt.md is properly referenced in SKILL.md steps +4. Meta-prompt template is inline in SKILL.md with required structural elements """ from __future__ import annotations @@ -18,10 +18,10 @@ import sys from pathlib import Path + ROOT = Path(__file__).resolve().parent.parent SKILL_DIR = ROOT / "skills" / "prompt-learning" SKILL_MD = SKILL_DIR / "SKILL.md" -META_PROMPT = SKILL_DIR / "resources" / "meta-prompt.md" AGENTS_MD = ROOT / "agents" / "AGENTS.md" PASS = "\033[32m✓\033[0m" @@ -185,112 +185,65 @@ def test_agents_md() -> list[str]: return errors -def test_meta_prompt_reference() -> list[str]: - """Check 4: resources/meta-prompt.md exists and is referenced in SKILL.md.""" +def test_inline_meta_prompt() -> list[str]: + """Check 4: Meta-prompt template is inline in SKILL.md with required elements.""" errors = [] - if not META_PROMPT.exists(): - errors.append(f"Meta-prompt not found at {META_PROMPT}") - return errors - if not SKILL_MD.exists(): - errors.append(f"SKILL.md not found at {SKILL_MD}") - return errors + return [f"SKILL.md not found at {SKILL_MD}"] - skill_text = SKILL_MD.read_text(encoding="utf-8") + text = SKILL_MD.read_text(encoding="utf-8") - # Check that SKILL.md references the meta-prompt file - if "resources/meta-prompt.md" not in skill_text: + # The meta-prompt should be inline in Phase 3 (not in a separate file) + resources_dir = SKILL_DIR / "resources" + if resources_dir.exists() and (resources_dir / "meta-prompt.md").exists(): errors.append( - "SKILL.md does not reference 'resources/meta-prompt.md'" + "Meta-prompt exists as separate file resources/meta-prompt.md — " + "should be inlined in SKILL.md Phase 3" ) - # Check that meta-prompt contains key structural elements - meta_text = META_PROMPT.read_text(encoding="utf-8") - - required_sections = [ - "GOAL", - "INPUTS", - "PROCESS", - "OUTPUT FORMAT", - "EXAMPLE", - "ISSUE TAXONOMY", - "FAILURE_EXAMPLES", - "POSITIVE_EXAMPLES", - "RULES_TO_APPEND", - "ITERATION_GUIDANCE", + # Check that Phase 3 contains the meta-prompt template + if "Execute the following meta-prompt" not in text: + errors.append("SKILL.md Phase 3 missing inline meta-prompt instruction") + + # Check required structural elements are present in the inline template + required_elements = [ + ("GOAL", "meta-prompt GOAL section"), + ("FAILURE_EXAMPLES", "failure examples input"), + ("POSITIVE_EXAMPLES", "positive examples input"), + ("FEEDBACK SHAPES", "feedback shape reference"), + ("STEP 1", "failure pattern analysis step"), + ("STEP 2", "anchor check step"), + ("STEP 3", "rule generation step"), + ("RULES_TO_APPEND", "rules 
output format"), + ("REGRESSION_TESTS", "regression test generation"), + ("ITERATION_GUIDANCE", "iteration guidance output"), + ("If [TRIGGER]", "rule format specification"), + ("LEARNED_RULES", "learned rules section reference"), ] - for section in required_sections: - if section not in meta_text: - errors.append(f"Meta-prompt missing required section: {section}") - - # Check that the meta-prompt has the "If [TRIGGER] then [ACTION]" rule format - if "If [TRIGGER]" not in meta_text: - errors.append( - "Meta-prompt missing rule format: 'If [TRIGGER], then [ACTION]'" - ) - - # Check that LEARNED_RULES section is referenced - if "LEARNED_RULES" not in meta_text: - errors.append("Meta-prompt missing LEARNED_RULES reference") - - return errors - - -def test_cross_file_consistency() -> list[str]: - """Check 5: Parameters and taxonomy are consistent between SKILL.md and meta-prompt.md.""" - errors = [] - - if not SKILL_MD.exists() or not META_PROMPT.exists(): - return ["Cannot check consistency — files missing"] - - skill_text = SKILL_MD.read_text(encoding="utf-8") - meta_text = META_PROMPT.read_text(encoding="utf-8") - - # Check that all taxonomy tags mentioned in meta-prompt are referenced in SKILL.md - # Only match tags in the ISSUE TAXONOMY section (between "ISSUE TAXONOMY:" and next empty line) - taxonomy_match = re.search( - r"ISSUE TAXONOMY:\n((?:- \w+:.*\n)+)", meta_text - ) - taxonomy_tags = ( - re.findall(r"^- (\w+):", taxonomy_match.group(1), re.MULTILINE) - if taxonomy_match - else [] - ) - for tag in taxonomy_tags: - if tag not in skill_text: - errors.append( - f"Taxonomy tag '{tag}' in meta-prompt.md not referenced in SKILL.md" - ) - - # Check that SKILL.md taxonomy references match meta-prompt taxonomy - skill_tags = re.findall(r"`(\w+)`", skill_text.split("Issue Taxonomy")[1].split("##")[0]) if "Issue Taxonomy" in skill_text else [] - meta_tags = set(taxonomy_tags) - for tag in skill_tags: - if tag in ( - "accuracy", "missing_requirement", "policy", "safety", - "formatting", "verbosity", "tone", "tool_use", "reasoning", - "hallucination", - ) and tag not in meta_tags: - errors.append( - f"Taxonomy tag '{tag}' in SKILL.md not found in meta-prompt.md" - ) - - # Check that key structural elements referenced in SKILL.md steps exist in meta-prompt - # (e.g., output sections A-F) - for section_label in ("PATTERN_ANALYSIS", "ANCHOR_CHECK", "RULES_TO_APPEND", "REGRESSION_TESTS", "ITERATION_GUIDANCE"): - if section_label in skill_text and section_label not in meta_text: - errors.append( - f"SKILL.md references output section '{section_label}' not found in meta-prompt.md" - ) - - # Check that the LEARNED_RULES format is consistent - skill_has_learned_rules = "### LEARNED_RULES" in skill_text - meta_has_learned_rules = "LEARNED_RULES" in meta_text - if skill_has_learned_rules and not meta_has_learned_rules: - errors.append("SKILL.md references LEARNED_RULES but meta-prompt.md does not") - if meta_has_learned_rules and not skill_has_learned_rules: - errors.append("meta-prompt.md references LEARNED_RULES but SKILL.md does not") + for element, description in required_elements: + if element not in text: + errors.append(f"SKILL.md missing meta-prompt element: {description} ({element})") + + # Check that the issue taxonomy tags in the inline template match the + # Issue Taxonomy section + taxonomy_section = text.split("Issue Taxonomy")[1].split("##")[0] if "Issue Taxonomy" in text else "" + taxonomy_tags = re.findall(r"`(\w+)`", taxonomy_section) + expected_tags = { + "accuracy", 
"missing_requirement", "policy", "safety", + "formatting", "verbosity", "tone", "tool_use", "reasoning", + "hallucination", + } + # Only check tags that look like taxonomy entries + found_tags = {t for t in taxonomy_tags if t in expected_tags} + missing_tags = expected_tags - found_tags + if missing_tags: + errors.append(f"Issue Taxonomy section missing tags: {missing_tags}") + + # Check that the inline meta-prompt also lists the taxonomy tags + # (look for the compact taxonomy list in the template) + if "accuracy" not in text or "hallucination" not in text: + errors.append("Inline meta-prompt missing taxonomy tag references") return errors @@ -300,8 +253,7 @@ def main() -> None: ("Frontmatter matches existing skill patterns", test_frontmatter), ("Companion skill references are valid", test_companion_skills), ("AGENTS.md includes prompt-learning correctly", test_agents_md), - ("Meta-prompt is referenced and well-structured", test_meta_prompt_reference), - ("Cross-file consistency (SKILL.md ↔ meta-prompt.md)", test_cross_file_consistency), + ("Meta-prompt is inline and well-structured", test_inline_meta_prompt), ] total_errors = 0 diff --git a/skills/prompt-learning/SKILL.md b/skills/prompt-learning/SKILL.md index b1b0815..80db3f1 100644 --- a/skills/prompt-learning/SKILL.md +++ b/skills/prompt-learning/SKILL.md @@ -100,7 +100,20 @@ Never deploy rules without running an A/B experiment. The validation experiment ## Issue Taxonomy -The meta-prompt classifies failures into 10 types: `accuracy`, `missing_requirement`, `policy`, `safety`, `formatting`, `verbosity`, `tone`, `tool_use`, `reasoning`, and `hallucination`. See the full taxonomy with descriptions in `resources/meta-prompt.md` (ISSUE TAXONOMY section). +The meta-prompt classifies failures into these types: + +| Issue Type | Description | +|------------|-------------| +| `accuracy` | Factually incorrect or imprecise outputs | +| `missing_requirement` | Fails to address part of the user's request | +| `policy` | Violates organizational policies or guidelines | +| `safety` | Produces harmful, biased, or inappropriate content | +| `formatting` | Wrong output structure, missing fields, schema violations | +| `verbosity` | Too long or too short for the context | +| `tone` | Inappropriate register, persona drift | +| `tool_use` | Wrong tool selected, incorrect arguments, misinterpreted results | +| `reasoning` | Flawed logic, incorrect deductions | +| `hallucination` | Fabricated facts, citations, or capabilities | ## Destructive Actions @@ -176,9 +189,7 @@ Follow these steps **in order**. Do NOT skip steps. ### Phase 3: Generate Rules (Meta-Prompt) -6. **Load the meta-prompt template** from `resources/meta-prompt.md`. - -7. **Fill in the meta-prompt variables:** +6. **Build the meta-prompt** by filling in the template below with collected data: - `PROMPT_TYPE`: "agent" or "evaluator" based on target - `CURRENT_PROMPT`: full text of the current prompt version - `ITERATION`: current iteration number (starts at 1) @@ -186,13 +197,71 @@ Follow these steps **in order**. Do NOT skip steps. - `FAILURE_EXAMPLES`: the f=10 sampled failures with normalized feedback - `POSITIVE_EXAMPLES`: the p=3 sampled positive traces -8. 
**Execute the meta-prompt** and collect the structured output: - - A) PATTERN_ANALYSIS — recurring patterns and one-off issues - - B) ANCHOR_CHECK — conflicts with positive anchors - - C) RULES — numbered "If [TRIGGER] then [ACTION]" rules - - D) RULES_TO_APPEND — formatted text block for the prompt - - E) REGRESSION_TESTS — test cases for validation - - F) ITERATION_GUIDANCE — continue or stop recommendation +7. **Execute the following meta-prompt** (send it to the LLM with the variables filled in): + + ~~~ + You are a prompt engineer improving a prompt based on feedback from multiple examples. + + GOAL: Analyze a batch of feedback (failures + positive anchors) and produce minimal, high-impact rules that: + 1. Fix recurring failure patterns + 2. Don't break existing good behavior (regression anchors) + + INPUTS: + 1) PROMPT_TYPE: "agent" | "evaluator" + 2) CURRENT_PROMPT: The prompt to improve + 3) ITERATION: Current iteration number + 4) FEEDBACK_SOURCE: "human" | "ai_eval" + 5) FAILURE_EXAMPLES (5-15 samples with negative feedback): + [{"user_input": "...", "model_output": "...", "reference": "..." (optional), "feedback": }, ...] + 6) POSITIVE_EXAMPLES (2-5 regression anchors): + [{"user_input": "...", "model_output": "...", "feedback": "pass" | {"value": true, ...}}, ...] + + FEEDBACK SHAPES: + - Human categorical: "fail" | "pass" | "borderline" + - Human numerical: 3 (just the number) + - Human free text: "The response was too vague..." + - AI eval boolean: {"value": true|false, "explanation": "..."} + - AI eval categorical: {"value": "A"|"B"|"C", "explanation": "..."} + - AI eval numerical: {"value": 6, "scale": "1-10", "explanation": "..."} + - Enriched normalized: {"verdict": "fail", "severity": 4, "issue_tags": ["missing_requirement"], "explanation": "..."} + + PROCESS: + + STEP 1 — ANALYZE FAILURE PATTERNS: + Group failures by issue type. Identify recurring patterns (2+ occurrences). + Issue taxonomy: accuracy, missing_requirement, policy, safety, formatting, verbosity, tone, tool_use, reasoning, hallucination. + Output: {"patterns": [{"issue_tag": "...", "count": N, "severity": 1-5, "examples": [indices], "root_cause": "..."}], "one_off_issues": [...]} + + STEP 2 — CHECK AGAINST POSITIVE ANCHORS: + For each pattern, verify the fix won't break positive examples. + Output: {"anchor_conflicts": [{"pattern": "...", "conflicting_anchor": index, "conflict_reason": "..."}]} + + STEP 3 — GENERATE RULES (only for recurring patterns without conflicts): + Create 1-5 rules. Format: "If [TRIGGER], then [ACTION]." + Prioritize by: frequency × severity. + Skip: one-offs, anchor conflicts, patterns too vague to test. + + STEP 4 — FORMAT RULES_TO_APPEND: + Text block for ### LEARNED_RULES section. + + STEP 5 — GENERATE REGRESSION TESTS: + Create 5-10 test cases: 3-5 "should_now_pass" + 2-5 "should_still_pass". + + STEP 6 — ITERATION GUIDANCE: + Recommend "continue" (significant patterns remain) or "stop" (diminishing returns). + + OUTPUT FORMAT: + A) PATTERN_ANALYSIS — JSON with patterns and one_off_issues + B) ANCHOR_CHECK — JSON with anchor_conflicts and safe_to_patch list + C) RULES — numbered list + D) RULES_TO_APPEND — text block for the prompt + E) REGRESSION_TESTS — JSON array of test cases + F) ITERATION_GUIDANCE — {"recommendation": "continue"|"stop", "reason": "...", "remaining_issues": N, "expected_next_iteration_gain": "high"|"medium"|"low"} + + NOW PROCESS THE ACTUAL INPUT. + ~~~ + +8. **Collect the structured output** (sections A through F). 9. 
**Review the output** with the user: - Show the identified patterns and their frequency/severity diff --git a/skills/prompt-learning/resources/meta-prompt.md b/skills/prompt-learning/resources/meta-prompt.md deleted file mode 100644 index 4346d08..0000000 --- a/skills/prompt-learning/resources/meta-prompt.md +++ /dev/null @@ -1,271 +0,0 @@ -# Prompt Learning Meta-Prompt (v2, batch-mode) - -Use this meta-prompt template when generating rules from feedback. Pass it the variables described in INPUTS below. - ---- - -``` -You are a prompt engineer improving a prompt based on feedback from multiple examples. - -══════════════════════════════════════════════════════════════════════ -GOAL -══════════════════════════════════════════════════════════════════════ -Analyze a batch of feedback (failures + positive anchors) and produce minimal, high-impact rules that: -1. Fix recurring failure patterns -2. Don't break existing good behavior (regression anchors) - -══════════════════════════════════════════════════════════════════════ -INPUTS -══════════════════════════════════════════════════════════════════════ -1) PROMPT_TYPE: "agent" | "evaluator" -2) CURRENT_PROMPT: The prompt to improve -3) ITERATION: Current iteration number (1-8) -4) FEEDBACK_SOURCE: "human" | "ai_eval" - -5) FAILURE_EXAMPLES (5-15 samples with negative feedback; typically 6-14): - [ - { - "user_input": "...", - "model_output": "...", - "reference": "..." (optional), - "feedback": - }, - ... - ] - -6) POSITIVE_EXAMPLES (2-5 regression anchors; typically 3-5): - [ - { - "user_input": "...", - "model_output": "...", - "feedback": "pass" | { "value": true, ... } - }, - ... - ] - -FEEDBACK SHAPES: -- Human categorical: "fail" | "pass" | "borderline" -- Human numerical: 3 (just the number) -- Human free text: "The response was too vague..." -- AI eval boolean: { "value": true|false, "explanation": "..." } -- AI eval categorical: { "value": "A"|"B"|"C", "explanation": "..." } -- AI eval numerical: { "value": 6, "scale": "1-10", "explanation": "..." } -- Enriched normalized (recommended if raw feedback lacks explanations): - { "verdict": "fail", "severity": 4, "issue_tags": ["missing_requirement"], "explanation": "..." } - -══════════════════════════════════════════════════════════════════════ -PROCESS -══════════════════════════════════════════════════════════════════════ - -STEP 1: ANALYZE FAILURE PATTERNS -Group the failure examples by issue type. Identify recurring patterns (2+ occurrences). - -ISSUE TAXONOMY: -- accuracy: factually wrong -- missing_requirement: didn't fulfill explicit requirement -- policy: violated policy/guideline -- safety: unsafe content -- formatting: wrong format/structure -- verbosity: too long/short -- tone: wrong tone/style -- tool_use: incorrect tool usage -- reasoning: flawed logic -- hallucination: made up information - -Output pattern analysis: -{ - "patterns": [ - { - "issue_tag": "", - "count": , - "severity": <1-5>, - "examples": [], - "root_cause": "" - } - ], - "one_off_issues": [] -} - -STEP 2: CHECK AGAINST POSITIVE ANCHORS -For each identified pattern, verify the proposed fix won't break positive examples. - -{ - "anchor_conflicts": [ - { - "pattern": "", - "conflicting_anchor": , - "conflict_reason": "" - } - ] -} - -STEP 3: GENERATE RULES (only for recurring patterns without conflicts) -Create 1-5 rules that address the most impactful patterns. - -Rule format: "If [TRIGGER], then [ACTION]." 
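A minimal Python sketch of the prioritization this step describes (the same frequency × severity ranking survives into the inline template that replaces this file); the `patterns` structure mirrors the PATTERN_ANALYSIS output above, and the function name and cap are illustrative rather than part of the skill:

```python
# Sketch of the STEP 3 prioritization, assuming PATTERN_ANALYSIS-shaped dicts.
# The function name and the max_rules cap (1-5 rules) are illustrative.
def select_patterns(patterns: list[dict], max_rules: int = 5) -> list[dict]:
    # Only recurring patterns (2+ occurrences) are eligible for rules;
    # one-offs are treated as noise and skipped.
    recurring = [p for p in patterns if p["count"] >= 2]
    # Rank by impact: frequency × severity, highest first.
    recurring.sort(key=lambda p: p["count"] * p["severity"], reverse=True)
    return recurring[:max_rules]
```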
- -Prioritize by: frequency × severity - -Skip patterns that: -- Appeared only once (one-off) -- Would conflict with positive anchors -- Are too vague to create testable rules - -STEP 4: FORMAT RULES_TO_APPEND -Text block for ### LEARNED_RULES section. - -STEP 5: GENERATE REGRESSION TESTS -Create 5-10 test cases: -- 3-5 "should_now_pass" (based on failure patterns) -- 2-5 "should_still_pass" (based on positive anchors) - -STEP 6: ITERATION GUIDANCE -Based on current iteration, suggest: -- Continue: if significant patterns remain unfixed -- Stop: if diminishing returns (patterns are one-offs or low severity) - -══════════════════════════════════════════════════════════════════════ -OUTPUT FORMAT -══════════════════════════════════════════════════════════════════════ - -### A) PATTERN_ANALYSIS -```json -{ - "patterns": [...], - "one_off_issues": [...] -} -``` - -### B) ANCHOR_CHECK -```json -{ - "anchor_conflicts": [...], - "safe_to_patch": [] -} -``` - -### C) RULES - - -### D) RULES_TO_APPEND -``` - -``` - -### E) REGRESSION_TESTS -```json -[...] -``` - -### F) ITERATION_GUIDANCE -{ - "recommendation": "continue" | "stop", - "reason": "", - "remaining_issues": , - "expected_next_iteration_gain": "high" | "medium" | "low" -} - -══════════════════════════════════════════════════════════════════════ -EXAMPLE -══════════════════════════════════════════════════════════════════════ - -Input: -- PROMPT_TYPE: agent -- CURRENT_PROMPT: "You are a helpful customer service assistant..." -- ITERATION: 1 -- FEEDBACK_SOURCE: ai_eval - -- FAILURE_EXAMPLES: - [ - {"user_input": "Return item bought 45 days ago", "model_output": "Sure, go to returns...", "feedback": {"value": false, "explanation": "Didn't check 30-day policy"}}, - {"user_input": "Return item from 2 months ago", "model_output": "Click return button...", "feedback": {"value": false, "explanation": "Ignored return window"}}, - {"user_input": "Refund for old purchase", "model_output": "Here's how to return...", "feedback": {"value": false, "explanation": "Should have mentioned policy limits"}}, - {"user_input": "Where is my order?", "model_output": "Check your email", "feedback": {"value": false, "explanation": "Too brief, should offer to look up"}}, - {"user_input": "Cancel subscription", "model_output": "Subscription cancelled.", "feedback": {"value": false, "explanation": "Didn't confirm or ask for feedback"}} - ] - -- POSITIVE_EXAMPLES: - [ - {"user_input": "Return item bought yesterday", "model_output": "I'd be happy to help with your return...", "feedback": {"value": true, "explanation": "Complete and helpful"}}, - {"user_input": "Track my package", "model_output": "Let me look that up for you. Your order #123 is...", "feedback": {"value": true, "explanation": "Proactive and specific"}} - ] - -Output: - -### A) PATTERN_ANALYSIS -```json -{ - "patterns": [ - { - "issue_tag": "policy", - "count": 3, - "severity": 4, - "examples": [0, 1, 2], - "root_cause": "Model provides return instructions without checking/mentioning the 30-day return policy" - }, - { - "issue_tag": "verbosity", - "count": 1, - "severity": 2, - "examples": [3], - "root_cause": "Response too brief" - }, - { - "issue_tag": "missing_requirement", - "count": 1, - "severity": 3, - "examples": [4], - "root_cause": "Didn't confirm action or gather feedback" - } - ], - "one_off_issues": ["verbosity", "missing_requirement (cancellation)"] -} -``` - -### B) ANCHOR_CHECK -```json -{ - "anchor_conflicts": [], - "safe_to_patch": ["policy"] -} -``` - -### C) RULES -1. 
If customer asks about returning an item, then first check if the purchase date is within the 30-day return window before providing return instructions. -2. If return request is outside the 30-day policy, then inform customer of the policy limit and offer alternatives (store credit, exception request). - -### D) RULES_TO_APPEND -``` -- For return requests: Always verify purchase date against 30-day return policy before providing instructions. -- If item is outside return window: State the policy clearly, then offer alternatives rather than denying outright. -``` - -### E) REGRESSION_TESTS -```json -[ - {"input": "Return item bought 45 days ago", "expected": "Mention 30-day policy, offer alternatives", "type": "should_now_pass"}, - {"input": "Refund for purchase from last month (35 days)", "expected": "Check date, explain policy", "type": "should_now_pass"}, - {"input": "Return item bought 2 months ago", "expected": "Policy limit + alternatives", "type": "should_now_pass"}, - {"input": "Return item bought yesterday", "expected": "Proceed with return instructions normally", "type": "should_still_pass"}, - {"input": "Track my package", "expected": "Look up order, provide status", "type": "should_still_pass"} -] -``` - -### F) ITERATION_GUIDANCE -```json -{ - "recommendation": "continue", - "reason": "Addressed 1 major pattern (policy/returns). Two one-off issues remain but may recur with more data.", - "remaining_issues": 2, - "expected_next_iteration_gain": "medium" -} -``` - -══════════════════════════════════════════════════════════════════════ -NOW PROCESS THE ACTUAL INPUT -══════════════════════════════════════════════════════════════════════ -``` - ---- - -> **Usage:** This file contains the raw prompt template between the ``` fences. Everything outside the fences is for the agent operator, not the LLM. See SKILL.md Phase 3 for how to fill in the variables. From bee253a52b999de7319fbe78128d84107c1acbea Mon Sep 17 00:00:00 2001 From: currentlycodinng <148545995+currentlycodinng@users.noreply.github.com> Date: Tue, 10 Mar 2026 15:15:17 +0100 Subject: [PATCH 4/6] Clarify meta-prompt execution: Claude follows the process directly The meta-prompt is instructions for Claude to follow as part of the skill, not a prompt to send to an external LLM. Updated wording to make this clear. Co-Authored-By: Claude Opus 4.6 --- skills/prompt-learning/SKILL.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/skills/prompt-learning/SKILL.md b/skills/prompt-learning/SKILL.md index 80db3f1..5dce892 100644 --- a/skills/prompt-learning/SKILL.md +++ b/skills/prompt-learning/SKILL.md @@ -197,7 +197,7 @@ Follow these steps **in order**. Do NOT skip steps. - `FAILURE_EXAMPLES`: the f=10 sampled failures with normalized feedback - `POSITIVE_EXAMPLES`: the p=3 sampled positive traces -7. **Execute the following meta-prompt** (send it to the LLM with the variables filled in): +7. **Follow the meta-prompt process below** with the variables filled in from the collected data: ~~~ You are a prompt engineer improving a prompt based on feedback from multiple examples. @@ -261,7 +261,7 @@ Follow these steps **in order**. Do NOT skip steps. NOW PROCESS THE ACTUAL INPUT. ~~~ -8. **Collect the structured output** (sections A through F). +8. **Produce the structured output** (sections A through F) from the analysis above. 9. 
**Review the output** with the user: - Show the identified patterns and their frequency/severity From 978bc4dc942392261fcef8fc7a8c874c71847975 Mon Sep 17 00:00:00 2001 From: currentlycodinng <148545995+currentlycodinng@users.noreply.github.com> Date: Wed, 11 Mar 2026 10:25:08 +0100 Subject: [PATCH 5/6] Update prompt-learning with RES-205 final results MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Major changes based on completed research experiments: - Domain gating: focused domains only (0% success on broad tasks) - P=0 default: positives not required, P=0 outperforms P=3-5 - Iterations: 1 optimal (2 max), more causes prompt bloat - Multi-judge validation mandatory: single-judge overestimates by 40-60% - Model ceiling warning: top-tier models (>4.5/5) show no improvement - Reference comparison anti-pattern: CriSPO-style refs make results worse - Simplified meta-prompt: removed anchor check step (P=0), added expected_behavior to normalized representation, added severity mapping - Added "When NOT to use" section - Updated defaults table, anti-patterns, and validation script - Removed "preliminary/pending" language — results are final Co-Authored-By: Claude Opus 4.6 --- README.md | 2 +- agents/AGENTS.md | 2 +- scripts/validate_prompt_learning.py | 93 ++++++------ skills/prompt-learning/SKILL.md | 210 ++++++++++++++++------------ 4 files changed, 173 insertions(+), 134 deletions(-) diff --git a/README.md b/README.md index a7737af..9181186 100644 --- a/README.md +++ b/README.md @@ -235,7 +235,7 @@ The `action-plan` skill can file tickets directly to Linear. See [Linear MCP set | **manage-memory** | Create and configure orq.ai Memory Stores for persistent context in conversational agents | [SKILL.md](skills/manage-memory/SKILL.md) | | **monitor-production** | Analyze production trace data for anomalies, cost trends, latency regressions, and emerging failure modes | [SKILL.md](skills/monitor-production/SKILL.md) | | **optimize-prompt** | Systematically iterate on a prompt deployment using trace data, A/B testing, and structured refinement techniques | [SKILL.md](skills/optimize-prompt/SKILL.md) | -| **prompt-learning** | Automatically improve prompts by collecting feedback, generating "If [TRIGGER] then [ACTION]" rules via a meta-prompt, and validating with A/B experiments | [SKILL.md](skills/prompt-learning/SKILL.md) | +| **prompt-learning** | Automatically improve prompts by collecting feedback, generating "If [TRIGGER] then [ACTION]" rules via a meta-prompt, and validating with multi-judge experiments | [SKILL.md](skills/prompt-learning/SKILL.md) | | **regression-test** | Run a quick regression check against a golden dataset to verify recent changes haven't degraded quality | [SKILL.md](skills/regression-test/SKILL.md) | | **run-experiment** | End-to-end LLM evaluation workflow — error analysis, dataset creation, experiment execution, result analysis, and ticket filing | [SKILL.md](skills/run-experiment/SKILL.md) | | **scaffold-integration** | Generate SDK integration code (Python or Node) for orq.ai agents, deployments, and knowledge bases in the user's codebase | [SKILL.md](skills/scaffold-integration/SKILL.md) | diff --git a/agents/AGENTS.md b/agents/AGENTS.md index d29e459..5b03898 100644 --- a/agents/AGENTS.md +++ b/agents/AGENTS.md @@ -48,7 +48,7 @@ manage-deployment: `Configure, version, and manage orq.ai deployments — model manage-memory: `Create and configure orq.ai Memory Stores for persistent context in conversational agents` 
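The description change above ("A/B experiments" → "multi-judge experiments") reflects the headline RES-205 finding that a single judge overestimates gains by 40-60%. A hedged sketch of the per-judge decision logic this patch specifies later in Phase 5 — judge names and scores here are placeholders, not real orq.ai output:

```python
# Hypothetical per-judge experiment results; names and scores are placeholders.
results = {
    "judge_a": {"baseline": 0.62, "variant": 0.70},
    "judge_b": {"baseline": 0.64, "variant": 0.66},
    "judge_c": {"baseline": 0.61, "variant": 0.65},
}
deltas = {j: r["variant"] - r["baseline"] for j, r in results.items()}

if any(d < 0 for d in deltas.values()):
    decision = "revert"       # any judge regresses -> rules may be too aggressive
elif sum(d > 0 for d in deltas.values()) > len(deltas) / 2:
    decision = "promote"      # majority improve, none regress -> clear win
else:
    decision = "investigate"  # mixed or flat -> likely noise
print(decision, deltas)
```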
monitor-production: `Analyze production trace data for anomalies, cost trends, latency regressions, and emerging failure modes` optimize-prompt: `Systematically iterate on a prompt deployment using trace data, A/B testing, and structured refinement techniques` -prompt-learning: `Automatically improve prompts by collecting feedback, generating "If [TRIGGER] then [ACTION]" rules via a meta-prompt, and validating with A/B experiments` +prompt-learning: `Automatically improve prompts by collecting feedback, generating "If [TRIGGER] then [ACTION]" rules via a meta-prompt, and validating with multi-judge experiments` regression-test: `Run a quick regression check against a golden dataset to verify recent changes haven't degraded quality` run-experiment: `End-to-end LLM evaluation workflow — error analysis, dataset creation, experiment execution, result analysis, and ticket filing` scaffold-integration: `Generate SDK integration code (Python or Node) for orq.ai agents, deployments, and knowledge bases in the user's codebase` diff --git a/scripts/validate_prompt_learning.py b/scripts/validate_prompt_learning.py index ef288f4..29aa501 100644 --- a/scripts/validate_prompt_learning.py +++ b/scripts/validate_prompt_learning.py @@ -9,7 +9,8 @@ 1. Frontmatter matches existing skill patterns 2. All companion skill references point to existing skills 3. AGENTS.md includes the new skill entry -4. Meta-prompt template is inline in SKILL.md with required structural elements +4. Meta-prompt is inline in SKILL.md with required structural elements +5. RES-205 research findings are reflected (domain gating, multi-judge, P=0) """ from __future__ import annotations @@ -62,29 +63,24 @@ def test_frontmatter() -> list[str]: text = SKILL_MD.read_text(encoding="utf-8") meta = parse_frontmatter(text) - # Required fields for field in ("name", "description", "allowed-tools"): if field not in meta: errors.append(f"Missing frontmatter field: {field}") - # Name should match directory if meta.get("name") != "prompt-learning": errors.append( f"Frontmatter name '{meta.get('name')}' doesn't match " f"directory 'prompt-learning'" ) - # Description should be non-empty if not meta.get("description"): errors.append("Frontmatter description is empty") - # allowed-tools should contain core tools present in all skills allowed = meta.get("allowed-tools", "") for tool in ("Bash", "Read", "Write", "Edit", "Grep", "Glob", "AskUserQuestion"): if tool not in allowed: errors.append(f"allowed-tools missing core tool: {tool}") - # Cross-check: compare against another skill's frontmatter pattern reference_skill = ROOT / "skills" / "optimize-prompt" / "SKILL.md" if reference_skill.exists(): ref_meta = parse_frontmatter( @@ -111,7 +107,6 @@ def test_companion_skills() -> list[str]: text = SKILL_MD.read_text(encoding="utf-8") existing = collect_existing_skills() - # Extract companion skill names from backtick references after "Companion skills:" companion_section = re.search( r"\*\*Companion skills:\*\*\s*\n((?:- .*\n)*)", text ) @@ -128,7 +123,6 @@ def test_companion_skills() -> list[str]: if name not in existing: errors.append(f"Companion skill '{name}' does not exist as a skill") - # Check bidirectional: do companion skills reference us back? 
for name in companion_names: companion_path = ROOT / "skills" / name / "SKILL.md" if companion_path.exists(): @@ -150,38 +144,19 @@ def test_agents_md() -> list[str]: text = AGENTS_MD.read_text(encoding="utf-8") - # Check skill path entry expected_path = 'prompt-learning -> "skills/prompt-learning/SKILL.md"' if expected_path not in text: errors.append(f"AGENTS.md missing path entry: {expected_path}") - # Check description entry in available_skills if "prompt-learning:" not in text: errors.append("AGENTS.md missing description entry for prompt-learning") - # Check alphabetical ordering in the skills list - path_entries = re.findall( - r" - (\S+) -> ", text - ) + path_entries = re.findall(r" - (\S+) -> ", text) if path_entries: sorted_entries = sorted(path_entries, key=str.lower) if path_entries != sorted_entries: errors.append("AGENTS.md skill list is not alphabetically sorted") - # Check alphabetical ordering in available_skills - desc_entries = re.findall(r"^(\S+):", text, re.MULTILINE) - # Filter to only skill entries (those that have backtick descriptions) - skill_descs = [ - e for e in desc_entries - if f"{e}:" in text and "`" in text.split(f"{e}:")[1].split("\n")[0] - ] - if skill_descs: - sorted_descs = sorted(skill_descs, key=str.lower) - if skill_descs != sorted_descs: - errors.append( - "AGENTS.md available_skills descriptions not alphabetically sorted" - ) - return errors @@ -194,7 +169,6 @@ def test_inline_meta_prompt() -> list[str]: text = SKILL_MD.read_text(encoding="utf-8") - # The meta-prompt should be inline in Phase 3 (not in a separate file) resources_dir = SKILL_DIR / "resources" if resources_dir.exists() and (resources_dir / "meta-prompt.md").exists(): errors.append( @@ -202,19 +176,14 @@ def test_inline_meta_prompt() -> list[str]: "should be inlined in SKILL.md Phase 3" ) - # Check that Phase 3 contains the meta-prompt template - if "Execute the following meta-prompt" not in text: + if "Follow the meta-prompt process below" not in text: errors.append("SKILL.md Phase 3 missing inline meta-prompt instruction") - # Check required structural elements are present in the inline template required_elements = [ ("GOAL", "meta-prompt GOAL section"), ("FAILURE_EXAMPLES", "failure examples input"), - ("POSITIVE_EXAMPLES", "positive examples input"), ("FEEDBACK SHAPES", "feedback shape reference"), ("STEP 1", "failure pattern analysis step"), - ("STEP 2", "anchor check step"), - ("STEP 3", "rule generation step"), ("RULES_TO_APPEND", "rules output format"), ("REGRESSION_TESTS", "regression test generation"), ("ITERATION_GUIDANCE", "iteration guidance output"), @@ -225,25 +194,62 @@ def test_inline_meta_prompt() -> list[str]: if element not in text: errors.append(f"SKILL.md missing meta-prompt element: {description} ({element})") - # Check that the issue taxonomy tags in the inline template match the - # Issue Taxonomy section taxonomy_section = text.split("Issue Taxonomy")[1].split("##")[0] if "Issue Taxonomy" in text else "" - taxonomy_tags = re.findall(r"`(\w+)`", taxonomy_section) expected_tags = { "accuracy", "missing_requirement", "policy", "safety", "formatting", "verbosity", "tone", "tool_use", "reasoning", "hallucination", } - # Only check tags that look like taxonomy entries + taxonomy_tags = re.findall(r"`(\w+)`", taxonomy_section) found_tags = {t for t in taxonomy_tags if t in expected_tags} missing_tags = expected_tags - found_tags if missing_tags: errors.append(f"Issue Taxonomy section missing tags: {missing_tags}") - # Check that the inline meta-prompt also lists the 
taxonomy tags - # (look for the compact taxonomy list in the template) - if "accuracy" not in text or "hallucination" not in text: - errors.append("Inline meta-prompt missing taxonomy tag references") + return errors + + +def test_research_alignment() -> list[str]: + """Check 5: RES-205 research findings are reflected in SKILL.md.""" + errors = [] + + if not SKILL_MD.exists(): + return [f"SKILL.md not found at {SKILL_MD}"] + + text = SKILL_MD.read_text(encoding="utf-8") + + # Domain gating + if "When NOT to use" not in text: + errors.append("Missing 'When NOT to use' section (domain gating)") + if "focused" not in text.lower(): + errors.append("Missing focused domain guidance") + if "broad" not in text.lower() and "general" not in text.lower(): + errors.append("Missing warning about broad/general domains") + + # Multi-judge validation + if "multi-judge" not in text.lower() and "multi_judge" not in text.lower(): + errors.append("Missing multi-judge validation requirement") + if "3+" not in text and "3 " not in text: + errors.append("Missing 3+ judge requirement") + if "overestimate" not in text.lower(): + errors.append("Missing single-judge overestimation warning (40-60%)") + + # P=0 default + defaults_section = text.split("## Defaults")[1].split("##")[0] if "## Defaults" in text else "" + if "P=0" not in defaults_section and "p) | 0" not in defaults_section: + errors.append("Defaults table should show P=0 (research finding)") + + # Ceiling effect + if "ceiling" not in text.lower(): + errors.append("Missing model ceiling effect warning") + + # No "preliminary" or "pending" language + if "preliminary" in text.lower() or "pending validation" in text.lower(): + errors.append("Still contains 'preliminary'/'pending' language — results are final") + + # Reference comparison anti-pattern + if "reference" not in text.lower() or "worse" not in text.lower(): + errors.append("Missing anti-pattern about reference comparisons making results worse") return errors @@ -254,6 +260,7 @@ def main() -> None: ("Companion skill references are valid", test_companion_skills), ("AGENTS.md includes prompt-learning correctly", test_agents_md), ("Meta-prompt is inline and well-structured", test_inline_meta_prompt), + ("RES-205 research findings reflected", test_research_alignment), ] total_errors = 0 diff --git a/skills/prompt-learning/SKILL.md b/skills/prompt-learning/SKILL.md index 5dce892..518cb3b 100644 --- a/skills/prompt-learning/SKILL.md +++ b/skills/prompt-learning/SKILL.md @@ -1,12 +1,12 @@ --- name: prompt-learning -description: Automatically improve prompts by collecting feedback, generating "If [TRIGGER] then [ACTION]" rules via a meta-prompt, and validating with A/B experiments +description: Automatically improve prompts by collecting feedback, generating "If [TRIGGER] then [ACTION]" rules via a meta-prompt, and validating with multi-judge experiments allowed-tools: Bash, Read, Write, Edit, Grep, Glob, WebFetch, Task, AskUserQuestion, mcp__linear-server__*, orq* --- # Prompt Learning -Automatically improve prompts through feedback-driven rule generation. Collects human or AI feedback, normalizes it to a shared representation, generates targeted "If [TRIGGER] then [ACTION]" rules via a meta-prompt, and appends them to a `### LEARNED_RULES` section — then validates with A/B experiments. +Automatically improve prompts through feedback-driven rule generation. 
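The normalization step that the next sentence describes can be sketched compactly. This helper, its name, the fail threshold on the 1-10 scale, and the clamp to 1-5 are editorial assumptions layered on the Phase 2 feedback shapes and severity mapping defined later in this patch:

```python
import math

def normalize(feedback) -> dict:
    """Sketch of the Phase 2 normalization; assumes the feedback shapes
    listed there. The clamp to 1-5 is an editorial safeguard."""
    if feedback in ("fail", "pass", "borderline"):            # human categorical
        verdict, severity = feedback, 3
    elif isinstance(feedback, (int, float)):                   # human 1-5 rating
        verdict = "fail" if feedback <= 2 else "borderline" if feedback == 3 else "pass"
        severity = 6 - feedback                                # severity mapping table
    elif isinstance(feedback, dict) and "scale" in feedback:   # AI eval 1-10
        score = feedback["value"]
        verdict = "fail" if score <= 5 else "pass"             # threshold is an assumption
        severity = math.ceil((10 - score) / 2)
    elif isinstance(feedback, dict):                           # AI eval boolean
        verdict = "pass" if feedback.get("value") is True else "fail"
        severity = 3
    else:                                                      # free text -> LLM enrichment
        verdict, severity = "fail", 3
    return {"verdict": verdict, "severity": max(1, min(5, int(severity))),
            "issue_tags": [], "expected_behavior": ""}
```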
Collects human or AI feedback, normalizes it to a shared representation, generates targeted "If [TRIGGER] then [ACTION]" rules via a meta-prompt, and appends them to a `### LEARNED_RULES` section — then validates with multi-judge experiments. This skill is **automated and feedback-driven**, distinct from `optimize-prompt` (which is manual/trace-driven). The pipeline is: Collect → Normalize → Meta-Prompt → Aggregate → Apply → Validate. @@ -25,6 +25,14 @@ This skill is **automated and feedback-driven**, distinct from `optimize-prompt` - Action plan recommends feedback-driven prompt improvement - User wants to close the loop between feedback collection and prompt updates - User has evaluator scores and wants to turn failures into prompt rules +- The target prompt serves a **focused domain** (email writing, code review, customer support, data extraction) + +## When NOT to use + +- **Broad/general tasks** — prompt learning shows 0% significant improvement on general helpfulness or open-ended chat (RES-205: 0/70 configs significant on MTBench helpfulness) +- **Top-tier models already at ceiling** — models scoring >4.5/5 on baseline show no improvement (GPT-4o at 4.75/5 had zero gain) +- **Without multi-judge validation** — single-judge evaluation overestimates improvement by 40-60%. If you cannot set up 3+ diverse judge models, results will be unreliable +- Use `optimize-prompt` instead for manual, trace-driven refinement on any domain type ## orq.ai Documentation @@ -83,20 +91,20 @@ curl -s -X POST https://my.orq.ai/v2/prompts//versions \ ## Core Principles -### 1. Feedback Is Fuel, Not Truth -Human and AI feedback use the **same core method** — only preprocessing differs. Normalize all feedback to a shared representation (verdict + severity + issue tags + explanation) before processing. Never trust a single piece of feedback in isolation. +### 1. Focused Domains Only +Prompt learning works on **narrow, well-defined tasks** (email writing, code review, customer support, data extraction). It does not work on broad/general helpfulness — RES-205 showed 0% significant improvement across 70 configurations on general helpfulness tasks. Always verify the target prompt serves a focused domain before proceeding. + +### 2. Feedback Is Fuel, Not Truth +Human and AI feedback use the **same core method** — only preprocessing differs. Normalize all feedback to a shared representation (verdict + severity + issue tags + expected behavior) before processing. Never trust a single piece of feedback in isolation. -### 2. Only Recurring Patterns Get Rules +### 3. Only Recurring Patterns Get Rules Require 2+ occurrences of a pattern before generating a rule. One-off issues are noise, not signal. The meta-prompt explicitly skips single-occurrence patterns. -### 3. Rules Are Additive, Never Destructive +### 4. Rules Are Additive, Never Destructive Rules are appended to a `### LEARNED_RULES` section in the prompt. Never rewrite or remove existing prompt instructions. Rules augment the prompt — they don't replace it. -### 4. Positive Anchors Prevent Regression -Always include p=3 positive traces (regression anchors) in each meta-prompt batch. These ensure generated rules don't break what already works. Skipping anchors leads to over-correction. - -### 5. Validate Before Promoting -Never deploy rules without running an A/B experiment. The validation experiment compares the prompt with rules against the baseline on the same dataset. No vibes-checking. +### 5. 
Multi-Judge Validation Is Mandatory +Never validate with a single judge model. Single-judge evaluation overestimates improvement by 40-60% (RES-205: single-judge showed +40%, multi-judge showed +6%). Always use 3+ diverse judge models for validation experiments. ## Issue Taxonomy @@ -124,42 +132,58 @@ The following actions require explicit user confirmation via `AskUserQuestion` b ## Defaults -> **Note:** These defaults are preliminary. RES-205 experiments are still running — values will be updated once results are final. +Research-validated configuration (RES-205): | Parameter | Default | Range | Notes | |-----------|---------|-------|-------| -| Failures per batch (f) | 10 | 5-15 | Representative sample, not exhaustive | -| Positives per batch (p) | 3 | 2-5 | Regression anchors to prevent over-correction | -| Iterations | 2 | 1-5 | 2 for GPT/Claude; up to 5 for Gemini (pending validation) | +| Failures per batch (f) | 10 | 5-15 | f=10 is optimal; f=5 too few to find patterns | +| Positives per batch (p) | 0 | 0-5 | P=0 outperforms P=3-5 on focused domains | +| Iterations | 1 | 1-2 | 1 iteration gives best results; 2 for consistency. More causes prompt bloat | | Occurrence threshold | 2+ | — | One-offs are skipped | | Rules per iteration | 1-5 | — | Prioritized by frequency × severity | | Total rule cap | 10 | — | Across all iterations | +| Validation judges | 3+ | 3-5 | Diverse models required; single-judge overestimates by 40-60% | + +**Expected effect size:** +0.4 to +0.9 on a 5-point scale for focused domains. Set expectations accordingly. + +**Model tier matters:** + +| Model Tier | Recommendation | +|------------|----------------| +| Mid-tier (Gemini Flash, GPT-4o-mini) | Good candidate — room to improve | +| Top-tier (GPT-4o, Claude Sonnet) | Skip — likely at ceiling (>4.5/5 baseline) | ## Steps Follow these steps **in order**. Do NOT skip steps. -### Phase 1: Identify Target and Collect Feedback (Collect) +### Phase 1: Identify Target and Assess Feasibility 1. **Identify the target prompt/deployment:** - Use `search_entities` with `type: "prompts"` to find the target prompt - Use HTTP API to get full prompt details including current version - Document: system message, user template, model, parameters -2. **Collect feedback data:** +2. **Assess feasibility — check domain and model tier:** + - **Domain check:** Is this a focused task (email, code review, support, extraction) or a broad/general task (open chat, helpfulness)? + - If broad/general → **stop**. Inform user that prompt learning is not effective for this domain type. Suggest `optimize-prompt` instead. + - **Model check:** What is the baseline quality score? + - If baseline > 4.5/5 → **warn** the user that top-tier models show ceiling effects and prompt learning may not yield improvement. + - **Present assessment** to the user before proceeding. + +3. **Collect feedback data:** - Use `list_traces` to pull traces with feedback for the target deployment - Collect at least 50 traces (more is better) from a meaningful time period - Separate into: negative feedback traces and positive feedback traces - Identify the feedback source: human (thumbs up/down, corrections, free-text) or AI evaluator scores -3. **Verify sufficient data:** +4. **Verify sufficient data:** - Need at least f=10 failure traces to proceed - - Need at least p=3 positive traces for regression anchors - If insufficient, inform user and suggest using `feedback-loop` to set up collection first ### Phase 2: Normalize Feedback (Normalize) -4. 
**Normalize feedback to shared representation:** +5. **Normalize feedback to shared representation:** All feedback — human or AI — gets normalized to this shape: ```json @@ -167,54 +191,60 @@ Follow these steps **in order**. Do NOT skip steps. "verdict": "fail" | "pass" | "borderline", "severity": 1-5, "issue_tags": [""], - "explanation": "..." + "expected_behavior": "what should have happened" } ``` + **Severity mapping:** + + | Condition | Severity | + |-----------|----------| + | Score on 1-5 scale (5 best) | `severity = 6 - score` | + | Score on 1-10 scale (10 best) | `severity = ceil((10 - score) / 2)` | + | Policy/safety violation or hallucination | 5 | + | Wrong answer or missed key requirement | 4 | + | Partial/incomplete response | 3 | + | Minor format/tone/verbosity issue | 2 | + | Nitpick/stylistic preference | 1 | + **For human feedback:** - - Thumbs down → `{"verdict": "fail", "severity": 3, "issue_tags": [], "explanation": ""}` - - Free-text correction → extract issue tags and explanation from the text - - Numerical rating (e.g., 1-5) → map to verdict (1-2: fail, 3: borderline, 4-5: pass) + - Thumbs down → `{"verdict": "fail", "severity": 3, "issue_tags": [], "expected_behavior": ""}` + - Free-text correction → extract issue tags and expected behavior from the text + - Numerical rating (e.g., 1-5) → map to verdict (1-2: fail, 3: borderline, 4-5: pass), use severity mapping above **For AI evaluator feedback:** - - Boolean false → `{"verdict": "fail", "severity": 3, "issue_tags": [], "explanation": ""}` + - Boolean false → `{"verdict": "fail", "severity": 3, "issue_tags": [], "expected_behavior": ""}` - Categorical/numerical → map to verdict based on scale, carry explanation through - If raw feedback lacks explanations (e.g., bare thumbs-down), use the LLM to enrich: pass the input/output pair and ask for a brief failure analysis to populate `issue_tags` and `explanation`. + If raw feedback lacks explanations (e.g., bare thumbs-down), use the LLM to enrich: pass the input/output pair and ask for a brief failure analysis to populate `issue_tags` and `expected_behavior`. -5. **Sample the batch:** +6. **Sample the batch:** - Sample f=10 representative failures (diverse issue types, not all the same failure) - - Sample p=3 positive traces as regression anchors - If more failures exist, prioritize diversity across issue types ### Phase 3: Generate Rules (Meta-Prompt) -6. **Build the meta-prompt** by filling in the template below with collected data: +7. **Build the meta-prompt** by filling in the template below with collected data: - `PROMPT_TYPE`: "agent" or "evaluator" based on target - `CURRENT_PROMPT`: full text of the current prompt version - `ITERATION`: current iteration number (starts at 1) - `FEEDBACK_SOURCE`: "human" or "ai_eval" - `FAILURE_EXAMPLES`: the f=10 sampled failures with normalized feedback - - `POSITIVE_EXAMPLES`: the p=3 sampled positive traces -7. **Follow the meta-prompt process below** with the variables filled in from the collected data: +8. **Follow the meta-prompt process below** with the variables filled in from the collected data: ~~~ You are a prompt engineer improving a prompt based on feedback from multiple examples. - GOAL: Analyze a batch of feedback (failures + positive anchors) and produce minimal, high-impact rules that: - 1. Fix recurring failure patterns - 2. Don't break existing good behavior (regression anchors) + GOAL: Analyze a batch of feedback failures and produce minimal, high-impact rules that fix recurring failure patterns. 
INPUTS: 1) PROMPT_TYPE: "agent" | "evaluator" 2) CURRENT_PROMPT: The prompt to improve 3) ITERATION: Current iteration number 4) FEEDBACK_SOURCE: "human" | "ai_eval" - 5) FAILURE_EXAMPLES (5-15 samples with negative feedback): - [{"user_input": "...", "model_output": "...", "reference": "..." (optional), "feedback": }, ...] - 6) POSITIVE_EXAMPLES (2-5 regression anchors): - [{"user_input": "...", "model_output": "...", "feedback": "pass" | {"value": true, ...}}, ...] + 5) FAILURE_EXAMPLES (10 samples with negative feedback): + [{"user_input": "...", "model_output": "...", "feedback": }, ...] FEEDBACK SHAPES: - Human categorical: "fail" | "pass" | "borderline" @@ -223,7 +253,7 @@ Follow these steps **in order**. Do NOT skip steps. - AI eval boolean: {"value": true|false, "explanation": "..."} - AI eval categorical: {"value": "A"|"B"|"C", "explanation": "..."} - AI eval numerical: {"value": 6, "scale": "1-10", "explanation": "..."} - - Enriched normalized: {"verdict": "fail", "severity": 4, "issue_tags": ["missing_requirement"], "explanation": "..."} + - Enriched normalized: {"verdict": "fail", "severity": 4, "issue_tags": ["missing_requirement"], "expected_behavior": "..."} PROCESS: @@ -232,51 +262,45 @@ Follow these steps **in order**. Do NOT skip steps. Issue taxonomy: accuracy, missing_requirement, policy, safety, formatting, verbosity, tone, tool_use, reasoning, hallucination. Output: {"patterns": [{"issue_tag": "...", "count": N, "severity": 1-5, "examples": [indices], "root_cause": "..."}], "one_off_issues": [...]} - STEP 2 — CHECK AGAINST POSITIVE ANCHORS: - For each pattern, verify the fix won't break positive examples. - Output: {"anchor_conflicts": [{"pattern": "...", "conflicting_anchor": index, "conflict_reason": "..."}]} - - STEP 3 — GENERATE RULES (only for recurring patterns without conflicts): + STEP 2 — GENERATE RULES (only for recurring patterns): Create 1-5 rules. Format: "If [TRIGGER], then [ACTION]." Prioritize by: frequency × severity. - Skip: one-offs, anchor conflicts, patterns too vague to test. + Skip: one-offs, patterns too vague to test. - STEP 4 — FORMAT RULES_TO_APPEND: + STEP 3 — FORMAT RULES_TO_APPEND: Text block for ### LEARNED_RULES section. - STEP 5 — GENERATE REGRESSION TESTS: + STEP 4 — GENERATE REGRESSION TESTS: Create 5-10 test cases: 3-5 "should_now_pass" + 2-5 "should_still_pass". - STEP 6 — ITERATION GUIDANCE: - Recommend "continue" (significant patterns remain) or "stop" (diminishing returns). + STEP 5 — ITERATION GUIDANCE: + Recommend "stop" (default after iteration 1) or "continue" (only if major patterns remain unfixed). OUTPUT FORMAT: A) PATTERN_ANALYSIS — JSON with patterns and one_off_issues - B) ANCHOR_CHECK — JSON with anchor_conflicts and safe_to_patch list - C) RULES — numbered list - D) RULES_TO_APPEND — text block for the prompt - E) REGRESSION_TESTS — JSON array of test cases - F) ITERATION_GUIDANCE — {"recommendation": "continue"|"stop", "reason": "...", "remaining_issues": N, "expected_next_iteration_gain": "high"|"medium"|"low"} + B) RULES — numbered list + C) RULES_TO_APPEND — text block for the prompt + D) REGRESSION_TESTS — JSON array of test cases + E) ITERATION_GUIDANCE — {"recommendation": "continue"|"stop", "reason": "...", "remaining_issues": N} NOW PROCESS THE ACTUAL INPUT. ~~~ -8. **Produce the structured output** (sections A through F) from the analysis above. +9. **Produce the structured output** (sections A through E) from the analysis above. -9. 
**Review the output** with the user: - - Show the identified patterns and their frequency/severity - - Show the generated rules - - Highlight any anchor conflicts - - Present ITERATION_GUIDANCE recommendation +10. **Review the output** with the user: + - Show the identified patterns and their frequency/severity + - Show the generated rules + - Present ITERATION_GUIDANCE recommendation ### Phase 4: Aggregate and Apply -10. **Aggregate rules** across iterations (if iteration > 1): +11. **Aggregate rules** across iterations (if iteration > 1): - Merge new rules with existing `### LEARNED_RULES` section - Remove duplicates or conflicting rules - Enforce total rule cap of 10 -11. **Apply rules to the prompt** — **ask user confirmation first:** +12. **Apply rules to the prompt** — **ask user confirmation first:** - Create a new prompt version with `### LEARNED_RULES` section appended - Use HTTP API to create the new version - Document what rules were added and which patterns they address @@ -289,51 +313,57 @@ Follow these steps **in order**. Do NOT skip steps. ... ``` -### Phase 5: Validate (A/B Experiment) +### Phase 5: Validate (Multi-Judge Experiment) -12. **Set up a validation experiment:** +13. **Set up a multi-judge validation experiment:** - Use `create_experiment` to compare baseline (no rules) vs variant (with rules) - - Use the same dataset for both runs + - Use the same dataset for both runs (10-50 examples) + - **Configure 3+ diverse judge models** (e.g., Gemini, GPT, Claude) — single-judge overestimates by 40-60% - Include evaluators that measure the targeted failure types - Include the regression tests from the meta-prompt output -13. **Run the experiment and analyze results:** +14. **Run the experiment and analyze results:** - Use `list_experiment_runs` to monitor progress - Use `get_experiment_run` to fetch results - - Compare: + - Compare across **all judges**: ``` - | Evaluator | Baseline | Variant | Delta | - |-----------|----------|---------|-------| - | [target metric] | X% | Y% | +Z% | - | [regression metric] | X% | Y% | +Z% | + | Judge Model | Evaluator | Baseline | Variant | Delta | + |-------------|-----------|----------|---------|-------| + | [judge 1] | [metric] | X% | Y% | +Z% | + | [judge 2] | [metric] | X% | Y% | +Z% | + | [judge 3] | [metric] | X% | Y% | +Z% | ``` -14. **Decision framework:** - - **Clear win** (>5% improvement on target, no regression) → Promote variant - - **Mixed results** (improvement + regression elsewhere) → Investigate, iterate - - **No improvement** → Re-examine feedback normalization, try different samples - - **Regression** → Revert, rules may be too aggressive +15. **Decision framework:** + - **Clear win** — majority of judges show improvement, no regression → Promote variant + - **Mixed results** — judges disagree → Investigate, may be noise + - **No improvement** — most judges show no change → Re-examine feedback, try different samples + - **Regression** — any judge shows regression → Revert, rules may be too aggressive + - **Single judge shows large gain but others don't** → Discard. This is the 40-60% overestimation pattern. -### Phase 6: Iterate +### Phase 6: Iterate (Usually Stop at 1) -15. **Check iteration guidance:** - - If meta-prompt recommends `"continue"` AND iteration < max (2 for GPT/Claude, 5 for Gemini): - - Return to Phase 1 Step 2 with updated prompt (now including rules) - - Collect fresh feedback or use remaining unprocessed failures +16. 
**Check iteration guidance:** + - **Default: stop after iteration 1.** Research shows 1 iteration gives best results; more iterations cause prompt bloat and diminishing returns. + - If significant failure patterns remain AND iteration < 2: + - Return to Phase 2 Step 6 with updated prompt (now including rules) + - Use remaining unprocessed failures - Increment iteration counter - - If meta-prompt recommends `"stop"` OR max iterations reached: + - If meta-prompt recommends `"stop"` OR iteration = 2: - Present final summary to user - If validated, **ask user confirmation** to promote to production deployment -16. **Final summary:** +17. **Final summary:** ``` ## Prompt Learning Summary - **Target:** [prompt/deployment name] + - **Domain type:** [focused domain description] - **Iterations completed:** [N] - **Rules generated:** [N] - **Feedback source:** human | ai_eval - **Key patterns addressed:** [list] - - **Validation result:** [pass/fail with metrics] + - **Multi-judge validation:** [pass/fail with per-judge metrics] + - **Effect size:** [delta on 5-point scale] - **Status:** [promoted / pending promotion / reverted] ``` @@ -341,13 +371,15 @@ Follow these steps **in order**. Do NOT skip steps. | Anti-Pattern | Why It's Wrong | What to Do Instead | |---|---|---| +| Using on broad/general tasks | 0% significant improvement on helpfulness (RES-205) | Only use on focused domains (email, code review, support) | +| Validating with a single judge | Overestimates improvement by 40-60% | Always use 3+ diverse judge models | | Acting on single-occurrence feedback | Noise, not signal | Require 2+ occurrences before generating rules | | Rewriting the whole prompt with rules | Destroys existing instructions | Only append to `### LEARNED_RULES` section | -| Skipping positive anchors | Rules may break what already works | Always include p=3 positive traces in each batch | -| Running more than 5 iterations | Diminishing returns, overfitting risk | Stop at 2 iterations (5 for Gemini) | +| Running more than 2 iterations | Prompt bloat, diminishing returns | Stop at 1 iteration (2 max) | | Treating human and AI feedback differently in the pipeline | Research shows same method works for both | Normalize to shared representation, then process identically | -| Deploying rules without validation experiment | No evidence the rules actually help | Always A/B test before promoting | -| Generating rules from too few failures | Insufficient pattern evidence | Wait for f=10 failures minimum per batch | +| Deploying rules without multi-judge validation | No reliable evidence the rules actually help | Always validate with 3+ judges before promoting | +| Applying to top-tier models at ceiling | Models scoring >4.5/5 show no improvement | Check baseline score first; skip if already high | +| Adding reference comparisons to the meta-prompt | CriSPO-style references made results worse (-21% vs -14%) | Use failure-only analysis without reference comparison | ## Open in orq.ai @@ -355,5 +387,5 @@ After completing this skill, direct the user to the relevant platform page: - **View/edit the prompt:** `https://my.orq.ai/prompts` — review the prompt with `### LEARNED_RULES` section - **Check traces with feedback:** `https://my.orq.ai/traces` — inspect traces that provided the feedback signal -- **View experiment results:** `https://my.orq.ai/experiments` — review the A/B validation experiment +- **View experiment results:** `https://my.orq.ai/experiments` — review the multi-judge validation experiment - **Feedback overview:** 
`https://my.orq.ai/feedback` — monitor ongoing feedback collection From a0cc2d7c186a635da0f05997378592cede2755e8 Mon Sep 17 00:00:00 2001 From: currentlycodinng <148545995+currentlycodinng@users.noreply.github.com> Date: Wed, 11 Mar 2026 10:42:20 +0100 Subject: [PATCH 6/6] Add model-specific configs, split model strategy, and freetext feedback guidance RES-205 findings: model-family-specific defaults (Claude/Gemini/GPT/Other), split model strategy (+20% vs +13%), freetext > categorical (+26.7% vs +6.7%). Co-Authored-By: Claude Opus 4.6 --- skills/prompt-learning/SKILL.md | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/skills/prompt-learning/SKILL.md b/skills/prompt-learning/SKILL.md index 518cb3b..f5990c9 100644 --- a/skills/prompt-learning/SKILL.md +++ b/skills/prompt-learning/SKILL.md @@ -150,8 +150,20 @@ Research-validated configuration (RES-205): | Model Tier | Recommendation | |------------|----------------| -| Mid-tier (Gemini Flash, GPT-4o-mini) | Good candidate — room to improve | -| Top-tier (GPT-4o, Claude Sonnet) | Skip — likely at ceiling (>4.5/5 baseline) | +| Small (Claude Haiku, Gemini Flash) | Best candidate — +40% improvement observed | +| Mid-tier (GPT-4o-mini) | Good candidate — room to improve | +| Top-tier (GPT-4o, Claude Sonnet) | Skip — likely at ceiling (>4.5/5 baseline), -3% to -10% observed | + +**Model-specific optimal configs:** + +| Model Family | F | P | Iterations | Notes | +|-------------|---|---|------------|-------| +| Claude | 10 | 0 | 3 | Small models (Haiku) learn best | +| Gemini | 15 | 0 | 1 | Higher failure count, single iteration | +| GPT | 10-15 | 3-5 | 3 | Benefits from positive anchors unlike others | +| Other / unknown | 10 | 0 | 1 | Conservative defaults; not experimentally validated — monitor closely | + +**Split model strategy (recommended):** Use a cheap model as the learner (the model being improved) and a powerful model as the generator (the model running the meta-prompt). RES-205 showed +20% win rate with split models vs +13% with same-model — the powerful generator produces better rules when analyzing a smaller model's failures. ## Steps @@ -176,6 +188,7 @@ Follow these steps **in order**. Do NOT skip steps. - Collect at least 50 traces (more is better) from a meaningful time period - Separate into: negative feedback traces and positive feedback traces - Identify the feedback source: human (thumbs up/down, corrections, free-text) or AI evaluator scores + - **Prefer freetext feedback** when available — RES-205 showed +26.7% improvement from freetext vs +6.7% from categorical feedback. If the user only has thumbs up/down, recommend enriching with freetext explanations. 4. **Verify sufficient data:** - Need at least f=10 failure traces to proceed
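Because patch 6 layers per-family configs on top of patch 5's generic defaults, the per-family values take precedence where they differ: Claude and GPT tolerate 3 iterations despite the generic 1-2 cap, and GPT's P=3-5 presumes restoring the POSITIVE_EXAMPLES input that patch 5 removed from the inline template. The split-model strategy itself is straightforward to wire up. A minimal sketch, assuming a generic chat-completion client — the model names, abridged template, and `complete()` helper are placeholders, not orq.ai APIs:

```python
import json

LEARNER_MODEL = "small-model-being-improved"   # e.g. a Haiku/Flash-tier model
GENERATOR_MODEL = "powerful-analysis-model"    # runs the meta-prompt

META_PROMPT = """You are a prompt engineer improving a prompt based on feedback.
CURRENT_PROMPT: {current_prompt}
FAILURE_EXAMPLES: {failures}
"""  # abridged stand-in for the full Phase 3 template

def complete(model: str, prompt: str) -> str:
    """Placeholder for an actual chat-completion call."""
    raise NotImplementedError

def run_iteration(current_prompt: str, failures: list[dict]) -> str:
    # f=10 failures, P=0, one iteration — the conservative defaults for
    # unknown model families in the table above.
    meta = META_PROMPT.format(
        current_prompt=current_prompt,
        failures=json.dumps(failures[:10], indent=2),
    )
    # The cheap learner produced the failures; the powerful generator
    # analyzes them — the split that showed +20% vs +13% same-model.
    return complete(GENERATOR_MODEL, meta)
```

In practice the returned sections A-E would then be parsed and the RULES_TO_APPEND block written to a new prompt version, pending user confirmation as Phase 4 requires.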