+
+
+
+
+
+
+
+
diff --git a/openexp/viz.py b/openexp/viz.py
new file mode 100644
index 0000000..2881c84
--- /dev/null
+++ b/openexp/viz.py
@@ -0,0 +1,1675 @@
+"""OpenExp Visualization — data export for self-contained HTML dashboard.
+
+Reads Q-cache, observations, sessions, predictions/outcomes and produces
+a sanitized JSON dict that gets embedded in the viz.html template.
+
+No raw memory text or file paths are included — aggregate stats only.
+"""
+import json
+import re
+import statistics
+from collections import Counter, defaultdict
+from datetime import datetime
+from pathlib import Path
+
+
+def _histogram(values, bin_start=-0.5, bin_end=1.0, num_bins=15):
+ """Create histogram bins from a list of numeric values."""
+ if not values:
+ return {"histogram": [], "stats": {}}
+
+ step = (bin_end - bin_start) / num_bins
+ counts = [0] * num_bins
+ for v in values:
+ idx = int((v - bin_start) / step)
+ idx = max(0, min(idx, num_bins - 1))
+ counts[idx] += 1
+
+ bins = []
+ for i in range(num_bins):
+ lo = bin_start + i * step
+ hi = lo + step
+ bins.append({"bin_start": round(lo, 4), "bin_end": round(hi, 4), "count": counts[i]})
+
+ return {
+ "histogram": bins,
+ "stats": {
+ "min": round(min(values), 4),
+ "max": round(max(values), 4),
+ "mean": round(statistics.mean(values), 4),
+ "median": round(statistics.median(values), 4),
+ "std": round(statistics.stdev(values), 4) if len(values) > 1 else 0,
+ "count": len(values),
+ },
+ }
+
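+# Output shape sketch (hypothetical values):
+#   _histogram([0.1, 0.2, 0.9], bin_start=0.0, bin_end=1.0, num_bins=2)
+#   -> {"histogram": [{"bin_start": 0.0, "bin_end": 0.5, "count": 2},
+#                     {"bin_start": 0.5, "bin_end": 1.0, "count": 1}],
+#       "stats": {"min": 0.1, "max": 0.9, "mean": 0.4, "median": 0.2,
+#                 "std": 0.4359, "count": 3}}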
+
+def _parse_date(ts_str):
+ """Extract date string (YYYY-MM-DD) from an ISO timestamp."""
+ if not ts_str:
+ return None
+ return ts_str[:10]
+
+
+def _load_jsonl(path):
+ """Load JSONL file, return list of dicts. Silently skip bad lines."""
+ entries = []
+ p = Path(path)
+ if not p.exists():
+ return entries
+ with open(p) as f:
+ for line in f:
+ line = line.strip()
+ if line:
+ try:
+ entries.append(json.loads(line))
+ except json.JSONDecodeError:
+ continue
+ return entries
+
+
+def _count_lines(path):
+ """Count lines in a file without reading content."""
+ p = Path(path)
+ if not p.exists():
+ return 0
+ count = 0
+ with open(p, "rb") as f:
+ for _ in f:
+ count += 1
+ return count
+
+
+def export_viz_data(no_qdrant=False):
+ """Export all visualization data as a dict ready for JSON embedding.
+
+ Args:
+ no_qdrant: Skip Qdrant queries (lifecycle stats, memory types).
+ Useful when Docker is not running.
+
+ Returns:
+ dict with all visualization data (sanitized, no raw text/paths).
+ """
+ from .core.config import (
+ DATA_DIR, Q_CACHE_PATH, OBSERVATIONS_DIR, SESSIONS_DIR,
+ )
+ from .core.q_value import QCache, DEFAULT_Q_CONFIG
+ from .core.hybrid_search import DEFAULT_HYBRID_WEIGHTS, STATUS_WEIGHTS
+
+ data = {}
+
+ # --- Q-cache ---
+ q_cache = QCache()
+ q_cache.load(Q_CACHE_PATH)
+ cache = q_cache._cache
+
+ # Extract flat q_data for default experience from nested format
+ def _flat(exp_dict):
+ """Get q_data for 'default' experience from nested cache entry."""
+ if isinstance(exp_dict, dict) and "default" in exp_dict:
+ return exp_dict["default"]
+ return exp_dict # fallback for any legacy format
+
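+    # Cache entries are nested by experience name, e.g. (illustrative):
+    #   {"<memory_id>": {"default": {"q_value": 0.42, "q_visits": 3, ...}}}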
+ flat_values = [_flat(v) for v in cache.values()]
+
+ q_combined = [v.get("q_value", 0.0) for v in flat_values]
+ q_action = [v.get("q_action", 0.0) for v in flat_values]
+ q_hypothesis = [v.get("q_hypothesis", 0.5) for v in flat_values]
+ q_fit = [v.get("q_fit", 0.5) for v in flat_values]
+
+ data["q_distribution"] = {
+ "combined": _histogram(q_combined),
+ "action": _histogram(q_action),
+ "hypothesis": _histogram(q_hypothesis),
+ "fit": _histogram(q_fit),
+ }
+
+ # Q-value evolution over time (group by date)
+ date_groups = defaultdict(lambda: {"combined": [], "action": [], "hypothesis": [], "fit": []})
+ for v in flat_values:
+ date = _parse_date(v.get("q_updated_at", ""))
+ if date:
+ date_groups[date]["combined"].append(v.get("q_value", 0.0))
+ date_groups[date]["action"].append(v.get("q_action", 0.0))
+ date_groups[date]["hypothesis"].append(v.get("q_hypothesis", 0.5))
+ date_groups[date]["fit"].append(v.get("q_fit", 0.5))
+
+ q_evolution = []
+ for date in sorted(date_groups.keys()):
+ g = date_groups[date]
+ q_evolution.append({
+ "date": date,
+ "mean_combined": round(statistics.mean(g["combined"]), 4) if g["combined"] else 0,
+ "mean_action": round(statistics.mean(g["action"]), 4) if g["action"] else 0,
+ "mean_hypothesis": round(statistics.mean(g["hypothesis"]), 4) if g["hypothesis"] else 0,
+ "mean_fit": round(statistics.mean(g["fit"]), 4) if g["fit"] else 0,
+ "count_updated": len(g["combined"]),
+ })
+ data["q_evolution"] = q_evolution
+
+ # Visits distribution
+ visits = [v.get("q_visits", 0) for v in flat_values]
+ visit_counts = Counter(visits)
+ data["visits_distribution"] = {
+ "histogram": [
+ {"visits": k, "count": v}
+ for k, v in sorted(visit_counts.items())
+ ]
+ }
+
+ # Calibration counts
+ calibrations = Counter(v.get("calibration", "uncalibrated") or "uncalibrated" for v in flat_values)
+ data["calibration_counts"] = dict(calibrations)
+
+ # --- Scoring config ---
+ data["scoring_config"] = {
+ "weights": {k: round(v, 2) for k, v in DEFAULT_HYBRID_WEIGHTS.items()},
+ "q_layer_weights": {
+ "action": DEFAULT_Q_CONFIG["q_action_weight"],
+ "hypothesis": DEFAULT_Q_CONFIG["q_hypothesis_weight"],
+ "fit": DEFAULT_Q_CONFIG["q_fit_weight"],
+ },
+ "q_learning": {
+ "alpha": DEFAULT_Q_CONFIG["alpha"],
+ "q_init": DEFAULT_Q_CONFIG["q_init"],
+ "q_floor": DEFAULT_Q_CONFIG["q_floor"],
+ "q_ceiling": DEFAULT_Q_CONFIG["q_ceiling"],
+ },
+ "status_weights": {k: round(v, 2) for k, v in STATUS_WEIGHTS.items()},
+ }
+
+ # --- Observations (line counts only, no content) ---
+ obs_dir = Path(OBSERVATIONS_DIR)
+ obs_timeline = []
+ if obs_dir.exists():
+ for f in sorted(obs_dir.glob("observations-*.jsonl")):
+ # Extract date from filename: observations-YYYY-MM-DD.jsonl
+ m = re.search(r"observations-(\d{4}-\d{2}-\d{2})\.jsonl$", f.name)
+ if m:
+ obs_timeline.append({
+ "date": m.group(1),
+ "observations_count": _count_lines(f),
+ })
+ data["observations_timeline"] = obs_timeline
+
+ # --- Sessions ---
+ sessions_dir = Path(SESSIONS_DIR)
+ session_dates = Counter()
+ if sessions_dir.exists():
+ for f in sessions_dir.glob("*.md"):
+ # Filename: YYYY-MM-DD-hexid.md
+ m = re.search(r"^(\d{4}-\d{2}-\d{2})", f.name)
+ if m:
+ session_dates[m.group(1)] += 1
+ data["sessions_by_date"] = [
+ {"date": d, "count": c} for d, c in sorted(session_dates.items())
+ ]
+
+ # --- Session retrievals ---
+ retrievals_path = DATA_DIR / "session_retrievals.jsonl"
+ retrievals = _load_jsonl(retrievals_path)
+ retrieval_dates = Counter()
+ retrieval_scores = []
+ for r in retrievals:
+ date = _parse_date(r.get("timestamp", ""))
+ if date:
+ retrieval_dates[date] += 1
+ scores = r.get("scores", [])
+ retrieval_scores.extend(scores)
+
+ data["retrievals"] = {
+ "total": len(retrievals),
+ "by_date": [{"date": d, "count": c} for d, c in sorted(retrieval_dates.items())],
+ "score_stats": _histogram(retrieval_scores, bin_start=0, bin_end=1.0, num_bins=10) if retrieval_scores else {"histogram": [], "stats": {}},
+ }
+
+ # --- Predictions & outcomes ---
+ predictions = _load_jsonl(DATA_DIR / "predictions.jsonl")
+ outcomes = _load_jsonl(DATA_DIR / "outcomes.jsonl")
+
+ resolved_count = sum(1 for p in predictions if p.get("status") == "resolved")
+ pending_count = sum(1 for p in predictions if p.get("status") != "resolved")
+ outcome_rewards = [o.get("reward", 0) for o in outcomes]
+
+ data["predictions"] = {
+ "total": len(predictions),
+ "resolved": resolved_count,
+ "pending": pending_count,
+ "avg_reward": round(statistics.mean(outcome_rewards), 4) if outcome_rewards else 0,
+ "reward_distribution": _histogram(outcome_rewards, bin_start=-1.0, bin_end=1.0, num_bins=10) if outcome_rewards else {"histogram": [], "stats": {}},
+ }
+
+ # --- Lifecycle (Qdrant) ---
+ lifecycle_data = {}
+ memory_types = {}
+ if not no_qdrant:
+ try:
+ from .core.lifecycle import MemoryLifecycle
+ lc = MemoryLifecycle()
+ lifecycle_data = lc.get_lifecycle_stats()
+ except Exception:
+ lifecycle_data = {}
+
+ try:
+ from .core.config import COLLECTION_NAME, QDRANT_HOST, QDRANT_PORT
+ from qdrant_client import QdrantClient
+ client = QdrantClient(host=QDRANT_HOST, port=QDRANT_PORT, timeout=5)
+ # Get memory type distribution
+ scroll_result = client.scroll(
+ collection_name=COLLECTION_NAME,
+ limit=100,
+ with_payload=["type"],
+ )
+ type_counts = Counter()
+ # Scroll all points to count types
+ points, next_offset = scroll_result
+ while points:
+ for point in points:
+ t = (point.payload or {}).get("type", "unknown")
+ type_counts[t] += 1
+ if next_offset is None:
+ break
+ points, next_offset = client.scroll(
+ collection_name=COLLECTION_NAME,
+ offset=next_offset,
+ limit=100,
+ with_payload=["type"],
+ )
+ memory_types = dict(type_counts)
+ except Exception:
+ memory_types = {}
+
+ data["lifecycle"] = lifecycle_data
+ data["memory_types"] = memory_types
+
+ # --- Meta ---
+    all_dates = [_parse_date(v.get("q_updated_at", "")) for v in flat_values]
+ all_dates = [d for d in all_dates if d]
+
+ data["meta"] = {
+ "generated_at": datetime.now().isoformat(),
+ "total_memories": len(cache),
+ "total_observations": sum(o["observations_count"] for o in obs_timeline),
+ "total_sessions": sum(s["count"] for s in data["sessions_by_date"]),
+ "total_retrievals": len(retrievals),
+ "data_range": {
+ "first": min(all_dates) if all_dates else None,
+ "last": max(all_dates) if all_dates else None,
+ },
+ }
+
+ _sanitize(data)
+ return data
+
+
+def _redact(text):
+ """Redact sensitive info from observation summaries for demo display."""
+ if not text:
+ return ""
+ # Redact file paths (with or without trailing path)
+ text = re.sub(r"/Users/\w+(?:/[^\s\"']*)?", "/~/...", text)
+ text = re.sub(r"/home/\w+(?:/[^\s\"']*)?", "/~/...", text)
+ # Redact email addresses → keep domain hint
+ text = re.sub(r"[\w.+-]+@[\w.-]+\.\w+", lambda m: m.group(0).split("@")[0][:2] + "***@" + m.group(0).split("@")[1], text)
+ # Redact API keys
+ text = re.sub(r"sk-ant-\S+", "sk-***", text)
+ return text
+
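+# Illustrative before/after pairs (made-up inputs):
+#   "/Users/alice/proj/app.py failed"  ->  "/~/... failed"
+#   "ping john.doe@example.com"        ->  "ping jo***@example.com"
+#   "token sk-ant-abc123 leaked"       ->  "token sk-*** leaked"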
+
+def _classify_step(obs):
+ """Classify an observation into a human-readable step type for the replay."""
+ tool = obs.get("tool", "")
+ summary = obs.get("summary", "")
+ s = summary.lower()
+
+ if "read_email" in s or "gmail" in s:
+ if "unread" in s or "inbox" in s:
+ return "scan_inbox", "Scanning inbox"
+ if "from:" in s or "--full" in s:
+ return "read_email", "Reading email thread"
+ if "in:sent" in s:
+ return "check_sent", "Checking sent history"
+ if "subject:" in s:
+ return "search_email", "Searching emails"
+ return "read_email", "Reading emails"
+ if "send_email" in s:
+ return "send_email", "Sending email reply"
+ if "search_memory" in s or "search -q" in s:
+ return "recall", "Recalling memories"
+ if "add_memory" in s:
+ return "store", "Storing new memory"
+ if "crm" in s or "leads.csv" in s or "activities.csv" in s:
+ return "crm", "Updating CRM"
+ if tool == "Edit":
+ return "edit", "Editing file"
+ if tool == "Write":
+ return "write", "Writing file"
+ if "grep" in s or "search" in s:
+ return "search", "Searching context"
+ if "git commit" in s or "git push" in s:
+ return "commit", "Committing changes"
+ return "action", "Working"
+
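+# e.g. (hypothetical observations):
+#   _classify_step({"tool": "Bash", "summary": "Ran: gmail inbox --unread"})
+#   -> ("scan_inbox", "Scanning inbox")
+#   _classify_step({"tool": "Edit", "summary": "Edited viz.py"})
+#   -> ("edit", "Editing file")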
+
+def _build_conversation(session_retrievals, steps, session_obs):
+ """Build a conversation timeline from retrieval queries and observations.
+
+ Retrieval queries contain user messages (the hook fires on each user prompt).
+ Observations contain Claude's actions. We pair them into a chat timeline.
+
+    All text is redacted: file paths are removed and emails anonymized via
+    _redact(); non-English user messages are mapped to English intents.
+    """
+    # Name replacement map — re.sub callback for anonymizing real names in
+    # queries (no name pattern is wired up yet, so this is currently unused)
+ _name_map = {}
+ _name_counter = [0]
+ _fictional_names = ["Alex", "Sarah", "Marcus", "Elena", "James", "Nadia"]
+
+ def _anonymize_name(match):
+ name = match.group(0)
+ if name.lower() not in _name_map:
+ idx = _name_counter[0] % len(_fictional_names)
+ _name_map[name.lower()] = _fictional_names[idx]
+ _name_counter[0] += 1
+ return _name_map[name.lower()]
+
+ def _is_cyrillic(text):
+ """Check if text is predominantly Cyrillic (non-English)."""
+ cyrillic = sum(1 for c in text if '\u0400' <= c <= '\u04ff')
+ return cyrillic > len(text) * 0.3
+
+ def _translate_intent(text, next_obs=None):
+ """Translate non-English user messages to English based on intent keywords.
+
+ Uses keyword matching to produce a natural English equivalent.
+ For a demo, this provides readable English without needing an LLM.
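+
+        e.g. (illustrative): "перевір пошту" ("check the mail") maps to
+        "Can you check the inbox for new messages?" via keyword stems.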
+ """
+ t = text.lower()
+
+ # Common intent patterns (Ukrainian/Russian → English)
+ if any(w in t for w in ["пошт", "email", "inbox", "mail", "лист"]):
+ if any(w in t for w in ["відписал", "написал", "replied", "відповіл"]):
+ return "Check the email? They replied. Write back and ask about the next steps."
+ if any(w in t for w in ["перевір", "check", "подивись"]):
+ return "Can you check the inbox for new messages?"
+ return "Check the email and handle it."
+ if any(w in t for w in ["давай", "go ahead", "ok", "ага", "так"]):
+ return "OK, go ahead."
+ if any(w in t for w in ["напиш", "write", "send", "відправ"]):
+ return "Write and send the reply."
+ if any(w in t for w in ["crm", "lead", "deal", "pipeline"]):
+ return "Update the CRM with the latest info."
+ if any(w in t for w in ["зроби", "do", "fix", "виправ"]):
+ return "Make the changes we discussed."
+
+ # Fallback: if still Cyrillic, summarize generically based on next action
+ if _is_cyrillic(text):
+ if next_obs:
+ step_type, _ = _classify_step(next_obs)
+ intent_map = {
+ "scan_inbox": "Check the inbox for new messages.",
+ "read_email": "Read that email thread.",
+ "search_email": "Search for the relevant emails.",
+ "send_email": "Send the reply.",
+ "recall": "Search our memory for context.",
+ "store": "Save this to memory.",
+ "crm": "Update the CRM.",
+ "edit": "Make the edits.",
+ "commit": "Commit the changes.",
+ }
+ return intent_map.get(step_type, "Handle this task.")
+ return "Handle this task."
+
+ return text
+
+ def _clean_query(query):
+ """Clean a retrieval query into a presentable user message."""
+ if not query:
+ return None
+ # Retrieval queries often have system context prepended — extract user part
+ # Look for natural language after system prefixes
+ parts = query.split("\n")
+ # Filter out lines that look like system context (paths, commands, etc.)
+ user_lines = []
+ for line in parts:
+ line = line.strip()
+ if not line:
+ continue
+ # Skip system-like lines
+ if any(line.startswith(p) for p in ["/", "Ran:", "Edited ", "Wrote ", "- ", "**"]):
+ continue
+ if re.match(r"^[a-f0-9]{8,}", line):
+ continue
+ # Skip very short fragments
+ if len(line) < 3:
+ continue
+ user_lines.append(line)
+
+ text = " ".join(user_lines).strip()
+ if not text or len(text) < 5:
+ return None
+
+ # Redact sensitive info
+ text = _redact(text)
+ return text
+
+ def _describe_action(obs):
+ """Generate a Claude response description from an observation."""
+ summary = obs.get("summary", "")
+ step_type, _ = _classify_step(obs)
+
+ if step_type == "scan_inbox":
+ return "Let me check the inbox for recent messages..."
+ if step_type == "search_email":
+ return "Searching for the relevant email thread..."
+ if step_type == "read_email":
+ return "Reading the full email conversation..."
+ if step_type == "check_sent":
+ return "Checking what was already sent to see the context..."
+ if step_type == "send_email":
+ return "Sending the reply now."
+ if step_type == "recall":
+ return "Searching memory for relevant context..."
+ if step_type == "store":
+ return "Saving this to memory for future reference."
+ if step_type == "crm":
+ return "Updating the CRM with the latest status..."
+ if step_type == "edit":
+ return "Making the requested changes..."
+ if step_type == "write":
+ return "Creating the file..."
+ if step_type == "commit":
+ return "Committing the changes..."
+ return "Working on it..."
+
+ conversation = []
+
+ # Map retrieval timestamps to find which user messages correspond to which steps
+ # Retrieval[0] = session start (auto, context from previous session)
+ # Retrieval[1+] = user messages that triggered recall hooks
+
+ used_retrievals = set()
+
+ # Session start message
+ conversation.append({
+ "step_index": 0,
+ "role": "system",
+ "text": "Session started. Retrieving relevant memories from Q-weighted search...",
+ })
+
+ # Match user messages (from retrievals) to steps
+ for r_idx, r in enumerate(session_retrievals):
+ if r_idx == 0:
+ continue # skip session start auto-retrieval
+
+ r_ts = r.get("timestamp", "")
+ user_msg = _clean_query(r.get("query", ""))
+ if not user_msg:
+ continue
+
+ # Find the step that this user message precedes
+ matched_step = None
+ matched_obs = None
+ for step in steps:
+ step_ts = step.get("timestamp", "")
+ if step_ts and r_ts and step_ts >= r_ts and step.get("type") != "session_start":
+ matched_step = step
+ # Find the corresponding observation for context
+ obs_idx = step["index"] - (1 if steps[0]["type"] == "session_start" else 0)
+ if 0 <= obs_idx < len(session_obs):
+ matched_obs = session_obs[obs_idx]
+ break
+
+ step_idx = matched_step["index"] if matched_step else len(steps) - 1
+
+ # Translate non-English messages to English for demo
+ if _is_cyrillic(user_msg):
+ user_msg = _translate_intent(user_msg, matched_obs)
+
+ conversation.append({
+ "step_index": step_idx,
+ "role": "user",
+ "text": user_msg,
+ })
+ used_retrievals.add(r_idx)
+
+ # Add Claude action descriptions for each observation step
+ for step in steps:
+ if step["type"] in ("session_start", "session_end"):
+ continue
+ obs_idx = step["index"] - (1 if steps[0]["type"] == "session_start" else 0)
+ if 0 <= obs_idx < len(session_obs):
+ action_text = _describe_action(session_obs[obs_idx])
+ conversation.append({
+ "step_index": step["index"],
+ "role": "assistant",
+ "text": action_text,
+ })
+
+ # Session end message
+ conversation.append({
+ "step_index": len(steps) - 1,
+ "role": "system",
+ "text": "Session complete. Computing reward and updating Q-values for all retrieved memories.",
+ })
+
+ # Sort by step_index
+ conversation.sort(key=lambda m: (m["step_index"], 0 if m["role"] == "user" else 1 if m["role"] == "assistant" else 2))
+
+ return conversation
+
+
+def _truncate(text, max_len=120):
+ """Truncate text with ellipsis."""
+ if not text or len(text) <= max_len:
+ return text or ""
+ return text[:max_len - 1] + "…"
+
+
+def _summarize_actions(action_types):
+ """Map action types to a readable English summary sentence.
+
+ >>> _summarize_actions(["scan_inbox", "read_email", "check_sent"])
+ "I'll handle this by checking the inbox, reading the email thread and checking sent history."
+ """
+ verb_map = {
+ "scan_inbox": "checking the inbox",
+ "read_email": "reading the email thread",
+ "check_sent": "checking sent history",
+ "search_email": "searching emails",
+ "send_email": "sending the email reply",
+ "recall": "recalling relevant memories",
+ "store": "storing a new memory",
+ "crm": "updating the CRM",
+ "edit": "editing files",
+ "write": "writing files",
+ "search": "searching for context",
+ "commit": "committing changes",
+ "action": "working on it",
+ }
+ verbs = []
+ seen = set()
+ for t in action_types:
+ verb = verb_map.get(t, "working on it")
+ if verb not in seen:
+ verbs.append(verb)
+ seen.add(verb)
+ if not verbs:
+ return "Working on it."
+ if len(verbs) == 1:
+ return f"I'll handle this by {verbs[0]}."
+ return "I'll handle this by " + ", ".join(verbs[:-1]) + " and " + verbs[-1] + "."
+
+
+def _build_beats(steps, conversation, session_obs):
+ """Group raw steps into narrative beats delimited by user messages.
+
+ Returns a list of beat dicts with schema:
+ id, type, title, subtitle, conversation, actions,
+ memories_recalled, memories_count, step_indices,
+ phase, reward_info, duration_hint
+ """
+ # Find user message step_indices from conversation
+ user_msg_indices = []
+ user_msgs = {}
+ for msg in conversation:
+ if msg["role"] == "user":
+ user_msg_indices.append(msg["step_index"])
+ user_msgs[msg["step_index"]] = msg["text"]
+ user_msg_indices.sort()
+
+ beats = []
+ beat_id = 0
+
+ # --- Beat 0: system_start ---
+ start_steps = []
+ start_conv = []
+ for s in steps:
+ if s["type"] == "session_start":
+ start_steps.append(s)
+ for msg in conversation:
+ if msg["role"] == "system" and msg["step_index"] == 0:
+ start_conv.append(msg)
+
+ # Collect session-start memories — will be shown in first user_turn beat
+ session_start_mems = []
+ if start_steps:
+ for s in start_steps:
+ for m in s.get("memories_recalled", []):
+ if m["id"] not in {x["id"] for x in session_start_mems}:
+ session_start_mems.append(m)
+
+ beats.append({
+ "id": beat_id,
+ "type": "system_start",
+ "title": "Session Start",
+ "subtitle": "Waiting for user request...",
+ "conversation": [{"role": m["role"], "text": m["text"]} for m in start_conv],
+ "actions": [],
+ "memories_recalled": [],
+ "memories_count": 0,
+ "step_indices": [s["index"] for s in start_steps],
+ "phase": "start",
+ "reward_info": None,
+ "duration_hint": 2000,
+ })
+ beat_id += 1
+
+ # --- Work steps (between start and end) ---
+ work_steps = [s for s in steps if s["type"] not in ("session_start", "session_end")]
+
+ if not user_msg_indices:
+ # No user messages → single "auto" beat
+ if work_steps:
+ action_types = [s["type"] for s in work_steps]
+ actions = []
+ all_mems = list(session_start_mems) # include session-start memories
+ seen_mem_ids = {m["id"] for m in all_mems}
+ for s in work_steps:
+ _, label = _classify_step({"summary": s.get("description", ""), "tool": s.get("tool", "")})
+ actions.append({"label": label, "type": s["type"], "step_index": s["index"]})
+ for m in s.get("memories_recalled", []):
+ if m["id"] not in seen_mem_ids:
+ all_mems.append(m)
+ seen_mem_ids.add(m["id"])
+
+ subtitle = _summarize_actions(action_types)
+ beats.append({
+ "id": beat_id,
+ "type": "auto",
+ "title": "Automated work",
+ "subtitle": _truncate(subtitle, 150),
+ "conversation": [{"role": "assistant", "text": subtitle, "summary": True}],
+ "actions": actions,
+ "memories_recalled": all_mems,
+ "memories_count": len(all_mems),
+ "step_indices": [s["index"] for s in work_steps],
+ "phase": "work",
+ "reward_info": None,
+ "duration_hint": max(3500, len(actions) * 1200),
+ })
+ beat_id += 1
+ else:
+ # Group work steps by user messages
+ # Each user message starts a new beat that includes all steps
+ # until the next user message
+ boundaries = user_msg_indices + [max(s["index"] for s in steps) + 1]
+
+ for b_idx, boundary in enumerate(user_msg_indices):
+ next_boundary = boundaries[b_idx + 1]
+ user_text = user_msgs.get(boundary, "")
+
+ # Steps in this beat: from this user message to next boundary
+ beat_steps = [s for s in work_steps if boundary <= s["index"] < next_boundary]
+ # Also include steps before first user message if this is the first user beat
+ if b_idx == 0:
+ pre_steps = [s for s in work_steps if s["index"] < boundary]
+ beat_steps = pre_steps + beat_steps
+
+ action_types = [s["type"] for s in beat_steps]
+ actions = []
+ # First user_turn gets session-start memories
+ if b_idx == 0:
+ all_mems = list(session_start_mems)
+ seen_mem_ids = {m["id"] for m in all_mems}
+ else:
+ all_mems = []
+ seen_mem_ids = set()
+ for s in beat_steps:
+ _, label = _classify_step({"summary": s.get("description", ""), "tool": s.get("tool", "")})
+ actions.append({"label": label, "type": s["type"], "step_index": s["index"]})
+ for m in s.get("memories_recalled", []):
+ if m["id"] not in seen_mem_ids:
+ all_mems.append(m)
+ seen_mem_ids.add(m["id"])
+
+ subtitle = _summarize_actions(action_types) if action_types else ""
+
+ beat_conv = [{"role": "user", "text": user_text}]
+ if subtitle:
+ beat_conv.append({"role": "assistant", "text": subtitle, "summary": True})
+
+ # Generate a title from user text
+ title = _truncate(user_text, 50) if user_text else "Continue work"
+
+ beats.append({
+ "id": beat_id,
+ "type": "user_turn",
+ "title": title,
+ "subtitle": _truncate(subtitle, 150),
+ "conversation": beat_conv,
+ "actions": actions,
+ "memories_recalled": all_mems,
+ "memories_count": len(all_mems),
+ "step_indices": [s["index"] for s in beat_steps],
+ "phase": "work",
+ "reward_info": None,
+ "duration_hint": max(3500, len(actions) * 1200),
+ })
+ beat_id += 1
+
+ # --- Final beat: system_end ---
+ end_step = next((s for s in steps if s["type"] == "session_end"), None)
+ end_conv = [msg for msg in conversation if msg["role"] == "system" and msg["step_index"] == len(steps) - 1]
+
+ reward_info = end_step.get("reward_info") if end_step else None
+ mem_updated = reward_info.get("memories_updated", 0) if reward_info else 0
+
+ beats.append({
+ "id": beat_id,
+ "type": "system_end",
+ "title": "Session Complete",
+ "subtitle": f"{mem_updated} memories updated via Q-learning",
+ "conversation": [{"role": m["role"], "text": m["text"]} for m in end_conv],
+ "actions": [],
+ "memories_recalled": [],
+ "memories_count": 0,
+ "step_indices": [end_step["index"]] if end_step else [],
+ "phase": "reward",
+ "reward_info": reward_info,
+ "duration_hint": 5000,
+ })
+
+ return beats
+
+
+def _clean_memory_preview(content, memory_type):
+ """Clean and truncate memory content for display based on type.
+
+ Session summaries contain raw logs — extract only the useful part.
+ Other types get light cleanup with a generous length limit.
+ """
+ if not content:
+ return ""
+
+ # Session summaries: extract just the meaningful first line
+ if memory_type in ("session_summary", "session"):
+ # Try to find project/summary info
+ lines = content.split("\n")
+ for line in lines:
+ line = line.strip().strip("#").strip("-").strip()
+ if not line or len(line) < 10:
+ continue
+ # Skip raw code/JSON
+ if any(c in line for c in ["{", "}", "json.load", "=", "(f)", "cache ="]):
+ continue
+ return _redact(_truncate(line, 150))
+ return _redact(_truncate(content.split("\n")[0], 100))
+
+ # Action observations: often start with "Ran: " — clean that
+ if content.startswith("Ran: "):
+ content = content[5:]
+
+ return _redact(_truncate(content, 200))
+
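+# e.g. (made-up content): 'Ran: git status' comes back as 'git status';
+# a session summary returns its first meaningful line, capped at 150 chars.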
+
+def _build_scenario(session_obs):
+ """Generate a narrative user story from session observations.
+
+ Returns a dict with story paragraphs, success/failure criteria.
+ The story is written for a general audience (HN/Reddit demo).
+ """
+ summaries = [o.get("summary", "").lower() for o in session_obs]
+
+ has_email_read = any("email" in s or "gmail" in s or "inbox" in s for s in summaries)
+ has_email_send = any("send_email" in s for s in summaries)
+ has_crm = any("crm" in s or "leads" in s or "activities" in s for s in summaries)
+ has_code = any(o.get("tool") in ("Edit", "Write") for o in session_obs)
+ has_commit = any("git commit" in s or "git push" in s for s in summaries)
+ n_actions = len(session_obs)
+
+ # --- Build narrative story ---
+ if has_email_read and has_email_send:
+ title = "Can AI reply to email using past context?"
+ story = (
+ "A user asks their AI assistant to check the inbox and reply to an email thread. "
+ "The catch: to write a good reply, the AI needs context from past conversations, "
+ "deal history, and previous decisions — all stored as memories."
+ )
+ challenge = (
+ "The system has hundreds of stored memories. It must find the RIGHT ones. "
+ "This is where Q-learning kicks in: memories that helped in previous sessions "
+ "have higher Q-values and rank first. Bad matches get penalized over time."
+ )
+ elif has_email_read:
+ title = "Can AI process email with the right context?"
+ story = (
+ "A user asks their AI to check the inbox and handle incoming emails. "
+ "To understand what matters, the AI needs context: who is this person? "
+ "What's the history? What was discussed before?"
+ )
+ challenge = (
+ "The system searches hundreds of stored memories to find relevant context. "
+ "Memories ranked by Q-value — past usefulness determines what surfaces first."
+ )
+ elif has_code and has_commit:
+ title = "Can AI write code using learned patterns?"
+ story = (
+ "A user asks their AI to make code changes and commit them. "
+ "The AI needs to recall coding patterns, architecture decisions, "
+ "and project conventions from past sessions."
+ )
+ challenge = (
+ "The right context makes the difference between clean code and bugs. "
+ "Q-learning ensures that helpful patterns rank higher over time."
+ )
+ elif has_crm:
+ title = "Can AI manage CRM with full context?"
+ story = (
+ "A user asks their AI to update the CRM with latest deal status. "
+ "The AI needs to recall deal history, contact details, and past interactions."
+ )
+ challenge = (
+ "CRM updates require accurate context. Q-learning ensures the right "
+ "deal context surfaces first, not outdated or irrelevant information."
+ )
+ else:
+ title = "Can AI complete tasks using learned experience?"
+ story = (
+ f"A user gives their AI assistant a task requiring {n_actions} actions. "
+ "The AI must recall relevant context from past sessions to do it well."
+ )
+ challenge = (
+ "The system searches stored memories, ranked by Q-value. "
+ "Each session, it learns which memories actually help — and which don't."
+ )
+
+ # Success / failure — concrete, short
+ success = []
+ failure = []
+ if has_email_read:
+ success.append("Finds relevant email context from memory")
+ if has_email_send:
+ success.append("Sends appropriate reply with full context")
+ if has_crm:
+ success.append("Updates CRM accurately")
+ if has_code:
+ success.append("Makes correct code changes")
+ success.append("Q-values go UP for useful memories")
+
+ if has_email_read:
+ failure.append("Retrieves wrong context (wrong client, old deal)")
+ if has_email_send:
+ failure.append("Sends reply missing key details")
+ failure.append("Q-values go DOWN for irrelevant memories")
+
+ return {
+ "title": title,
+ "story": story,
+ "challenge": challenge,
+ "success_criteria": success,
+ "failure_criteria": failure,
+ }
+
+
+def _build_outcome(session_obs, memory_q_values):
+ """Generate session outcome verdict from observations and Q-value changes.
+
+ Returns dict with verdict, achievements list, and key metrics.
+ """
+ summaries = [o.get("summary", "").lower() for o in session_obs]
+
+ # Count concrete achievements
+ achievements = []
+ email_read = sum(1 for s in summaries if "email" in s and ("read" in s or "inbox" in s or "gmail" in s))
+ email_sent = sum(1 for s in summaries if "send_email" in s)
+ crm_ops = sum(1 for s in summaries if "crm" in s or "leads" in s or "activities" in s)
+ files_mod = sum(1 for o in session_obs if o.get("tool") in ("Edit", "Write"))
+ mem_stored = sum(1 for s in summaries if "add_memory" in s)
+ commits = sum(1 for s in summaries if "git commit" in s)
+
+ if email_read > 0:
+ achievements.append(f"Email thread processed ({email_read} actions)")
+ if email_sent > 0:
+ achievements.append(f"Reply sent ({email_sent})")
+ if crm_ops > 0:
+ achievements.append(f"CRM updated ({crm_ops} ops)")
+ if files_mod > 0:
+ achievements.append(f"Files modified ({files_mod})")
+ if commits > 0:
+ achievements.append(f"Changes committed")
+ if mem_stored > 0:
+ achievements.append(f"New memories stored ({mem_stored})")
+
+ if not achievements:
+ achievements.append(f"{len(session_obs)} actions executed")
+
+ # Verdict from reward direction
+ positive = sum(1 for q in memory_q_values.values() if q.get("reward_direction") == "positive")
+ negative = sum(1 for q in memory_q_values.values() if q.get("reward_direction") == "negative")
+ total = len(memory_q_values)
+
+ if positive > 0 and negative == 0:
+ verdict = "productive"
+ verdict_label = "Productive Session"
+ verdict_emoji = "\u2705"
+ elif positive > negative:
+ verdict = "mostly_productive"
+ verdict_label = "Mostly Productive"
+ verdict_emoji = "\u2705"
+ elif negative > positive * 2:
+ verdict = "unproductive"
+ verdict_label = "Needs Improvement"
+ verdict_emoji = "\u26a0\ufe0f"
+ else:
+ verdict = "mixed"
+ verdict_label = "Mixed Results"
+ verdict_emoji = "\u2139\ufe0f"
+
+ return {
+ "verdict": verdict,
+ "verdict_label": verdict_label,
+ "verdict_emoji": verdict_emoji,
+ "achievements": achievements,
+ "metrics": {
+ "actions_taken": len(session_obs),
+ "memories_reinforced": positive,
+ "memories_penalized": negative,
+ "total_memories_updated": total,
+ },
+ }
+
+
+def generate_demo_replay():
+ """Generate a scripted demo replay with a realistic email-handling scenario.
+
+ Returns the same structure as export_replay_data() but with handcrafted,
+ anonymized content for a compelling HN/Reddit demo. Shows the full flow:
+ email found → memory query → context loaded → reply drafted → user approves → sent.
+
+ Rich conversation entries include content_type, flow states, and activity log.
+ """
+ from .core.q_value import DEFAULT_Q_CONFIG
+
+ now = datetime.now().isoformat()
+ today = datetime.now().strftime("%Y-%m-%d")
+
+ # --- Demo memories with realistic Q-values ---
+ memory_q_values = {
+ "a1b2c3d4": {
+ "combined": 0.55, "combined_before": 0.42, "combined_delta": 0.13,
+ "action": 0.58, "hypothesis": 0.50, "fit": 0.52,
+ "visits": 7, "last_reward": 0.52,
+ "reward_direction": "positive",
+ "preview": "DataBridge Inc \u2014 $25K annual contract. Alex Chen is CTO. "
+ "Initial contact Jan 2026. They focus on computer vision pipelines.",
+ "memory_type": "deal_context",
+ },
+ "b2c3d4e5": {
+ "combined": 0.51, "combined_before": 0.38, "combined_delta": 0.13,
+ "action": 0.54, "hypothesis": 0.45, "fit": 0.50,
+ "visits": 4, "last_reward": 0.52,
+ "reward_direction": "positive",
+ "preview": "Alex Chen prefers quarterly billing. Budget approval needed "
+ "above $20K. Decision-maker is VP Engineering.",
+ "memory_type": "client_preference",
+ },
+ "c3d4e5f6": {
+ "combined": 0.72, "combined_before": 0.60, "combined_delta": 0.12,
+ "action": 0.75, "hypothesis": 0.68, "fit": 0.70,
+ "visits": 12, "last_reward": 0.52,
+ "reward_direction": "positive",
+ "preview": "Standard volume discount: 10% above 30K items/month, "
+ "15% above 50K items/month. Enterprise tier requires annual commitment.",
+ "memory_type": "pricing_knowledge",
+ },
+ "d4e5f6a7": {
+ "combined": 0.38, "combined_before": 0.25, "combined_delta": 0.13,
+ "action": 0.40, "hypothesis": 0.35, "fit": 0.36,
+ "visits": 3, "last_reward": 0.52,
+ "reward_direction": "positive",
+ "preview": "Previous email to DataBridge discussed their CV pipeline: "
+ "200K images/month, bounding box + classification. "
+ "Quality requirement: 98%+ accuracy.",
+ "memory_type": "conversation_history",
+ },
+ "e5f6a7b8": {
+ "combined": 0.46, "combined_before": 0.33, "combined_delta": 0.13,
+ "action": 0.48, "hypothesis": 0.42, "fit": 0.44,
+ "visits": 5, "last_reward": 0.52,
+ "reward_direction": "positive",
+ "preview": "DataBridge evaluated 3 vendors, chose us for labeling quality. "
+ "Contract renewal discussion planned for Q2 2026.",
+ "memory_type": "deal_context",
+ },
+ }
+
+ scenario = {
+ "title": "Can AI reply to a client email using past deal context?",
+ "story": (
+ "A user asks their AI assistant to check the inbox. A client named Alex "
+ "has replied about proposal pricing. To write a good reply, the AI needs "
+ "to recall the deal history, pricing rules, and client preferences \u2014 "
+ "all stored as Q-ranked memories from previous sessions."
+ ),
+ "challenge": (
+ "The system has 847 stored memories. It must find the RIGHT 5 out of 847. "
+ "This is where Q-learning kicks in: memories that helped in previous email "
+ "sessions have higher Q-values and rank first. Irrelevant memories get "
+ "penalized over time."
+ ),
+ "success_criteria": [
+ "Finds the right client context from memory",
+ "Applies correct pricing rules",
+ "Sends a contextually accurate reply",
+ "Q-values go UP for useful memories",
+ ],
+ "failure_criteria": [
+ "Retrieves wrong client's deal history",
+ "Misquotes pricing or terms",
+ "Q-values go DOWN for irrelevant memories",
+ ],
+ }
+
+ outcome = {
+ "verdict": "productive",
+ "verdict_label": "Productive Session",
+ "verdict_emoji": "\u2705",
+ "achievements": [
+ "Email thread processed and replied",
+ "5 relevant memories retrieved from 847 total",
+ "Reply sent with correct pricing context",
+ "All 5 memories reinforced (+Q)",
+ ],
+ "metrics": {
+ "actions_taken": 6,
+ "memories_reinforced": 5,
+ "memories_penalized": 0,
+ "total_memories_updated": 5,
+ },
+ }
+
+ # --- Beats with rich conversation entries ---
+ beats = [
+ {
+ "id": 0, "type": "system_start",
+ "title": "Session Start",
+ "subtitle": "Loading agent memory...",
+ "conversation": [{
+ "role": "system", "text": "Session started. Loading 847 memories "
+ "from Q-weighted index...",
+ "content_type": "text", "flow": ["claude_to_memory"],
+ "activity": "\u2190 OpenExp: loaded 847 memories into search index",
+ }],
+ "actions": [], "memories_recalled": [], "memories_count": 0,
+ "step_indices": [0], "phase": "start",
+ "reward_info": None, "duration_hint": 2000,
+ },
+ {
+ "id": 1, "type": "user_turn",
+ "title": "Check inbox and handle email",
+ "subtitle": "User asks to check inbox and handle reply",
+ "conversation": [
+ {
+ "role": "user",
+ "text": "Check the inbox \u2014 Alex from DataBridge should "
+ "have replied about the proposal pricing.",
+ "content_type": "text", "flow": ["user_to_claude"],
+ "activity": "\u2197 User request received",
+ },
+ {
+ "role": "assistant",
+ "text": "Checking inbox via Gmail API...",
+ "content_type": "text", "flow": ["claude_to_tools"],
+ "activity": "\u2192 Gmail API: querying inbox for recent messages",
+ },
+ {
+ "role": "assistant", "text": "",
+ "content_type": "email_card",
+ "email": {
+ "from": "Alex Chen (DataBridge Inc)",
+ "subject": "Re: Data Labeling Proposal \u2014 Pricing Question",
+ "date": "2 hours ago",
+ "snippet": (
+ "Hi, thanks for the detailed proposal. Before we sign, "
+ "can you clarify the volume discount structure? We're "
+ "looking at 50K items/month initially, with plans to "
+ "scale to 100K by Q3. Also, is quarterly billing an "
+ "option? Our finance team prefers that cycle."
+ ),
+ },
+ "flow": ["tools_to_claude"],
+ "activity": "\u2190 Gmail: found 1 new email from Alex Chen",
+ },
+ {
+ "role": "assistant",
+ "text": "Let me check our history with DataBridge...",
+ "content_type": "text", "flow": ["claude_to_memory"],
+ "activity": "\u2192 OpenExp: searching 'DataBridge deal history pricing'",
+ },
+ {
+ "role": "assistant", "text": "",
+ "content_type": "memory_results",
+ "query": "DataBridge deal history pricing",
+ "memories": [
+ {"id": "a1b2c3d4",
+ "preview": "DataBridge Inc \u2014 $25K annual contract. "
+ "Alex Chen is CTO.",
+ "q_value": 0.42, "score": 0.89, "type": "deal_context"},
+ {"id": "c3d4e5f6",
+ "preview": "Volume discount: 10% above 30K, 15% above "
+ "50K items/month.",
+ "q_value": 0.60, "score": 0.85, "type": "pricing_knowledge"},
+ {"id": "b2c3d4e5",
+ "preview": "Alex prefers quarterly billing. Budget "
+ "approval needed above $20K.",
+ "q_value": 0.38, "score": 0.82, "type": "client_preference"},
+ {"id": "d4e5f6a7",
+ "preview": "Previous email: CV pipeline, 200K images/month.",
+ "q_value": 0.25, "score": 0.78,
+ "type": "conversation_history"},
+ {"id": "e5f6a7b8",
+ "preview": "Chose us over 2 vendors for quality. "
+ "Renewal in Q2.",
+ "q_value": 0.33, "score": 0.75, "type": "deal_context"},
+ ],
+ "flow": ["memory_to_claude"],
+ "activity": "\u2190 OpenExp: 5 memories found (best Q: 0.60)",
+ },
+ {
+ "role": "assistant",
+ "text": (
+ "Based on our deal context with DataBridge:\n\n"
+ "\u2022 Alex Chen is CTO, $25K annual contract\n"
+ "\u2022 They prefer quarterly billing "
+ "(his finance team's preference)\n"
+ "\u2022 Standard discount: 15% for 50K items/month volume\n"
+ "\u2022 Their CV pipeline processes 200K images/month\n"
+ "\u2022 They chose us over 2 other vendors for quality\n\n"
+ "I recommend: confirm the 15% volume discount for 50K+ items, "
+ "offer quarterly billing as he requested, and mention the Q2 "
+ "renewal timeline."
+ ),
+ "content_type": "text", "flow": ["claude_to_user"],
+ "activity": "\u2199 Claude: analysis complete, recommendation ready",
+ },
+ ],
+ "actions": [
+ {"label": "Querying Gmail API for recent messages",
+ "type": "scan_inbox", "step_index": 1},
+ {"label": "Found: 'Re: Data Labeling Proposal \u2014 Pricing'",
+ "type": "read_email", "step_index": 2},
+ {"label": "Searching OpenExp: 'DataBridge deal history pricing'",
+ "type": "recall", "step_index": 3},
+ {"label": "Retrieved 5 memories (best Q: 0.60)",
+ "type": "recall", "step_index": 4},
+ {"label": "Analyzing deal context and pricing rules",
+ "type": "action", "step_index": 5},
+ ],
+ "memories_recalled": [
+ {"id": "a1b2c3d4", "score": 0.89, "q_combined": 0.42},
+ {"id": "c3d4e5f6", "score": 0.85, "q_combined": 0.60},
+ {"id": "b2c3d4e5", "score": 0.82, "q_combined": 0.38},
+ {"id": "d4e5f6a7", "score": 0.78, "q_combined": 0.25},
+ {"id": "e5f6a7b8", "score": 0.75, "q_combined": 0.33},
+ ],
+ "memories_count": 5,
+ "step_indices": [1, 2, 3, 4, 5],
+ "phase": "work", "reward_info": None, "duration_hint": 8000,
+ },
+ {
+ "id": 2, "type": "user_turn",
+ "title": "Send the reply",
+ "subtitle": "User approves the draft",
+ "conversation": [
+ {
+ "role": "user",
+ "text": "Sounds good, send it.",
+ "content_type": "text", "flow": ["user_to_claude"],
+ "activity": "\u2197 User: approved",
+ },
+ {
+ "role": "assistant",
+ "text": "Drafting and sending the reply...",
+ "content_type": "text", "flow": ["claude_to_tools"],
+ "activity": "\u2192 Gmail API: composing and sending reply",
+ },
+ {
+ "role": "assistant", "text": "",
+ "content_type": "email_sent",
+ "email": {
+ "to": "Alex Chen (DataBridge Inc)",
+ "subject": "Re: Data Labeling Proposal \u2014 "
+ "Pricing Question",
+ "snippet": (
+ "Hi Alex,\n\n"
+ "Great questions! For your 50K items/month volume, "
+ "I'm happy to confirm our 15% volume discount \u2014 "
+ "that brings your per-item rate to $0.034.\n\n"
+ "Quarterly billing is absolutely available. I'll update "
+ "the proposal with Q1 billing terms.\n\n"
+ "Looking forward to finalizing before the Q2 renewal "
+ "cycle.\n\nBest regards"
+ ),
+ "status": "sent",
+ },
+ "flow": ["tools_to_claude"],
+ "activity": "\u2190 Gmail: reply sent successfully",
+ },
+ {
+ "role": "assistant",
+ "text": "Reply sent successfully. Saving this interaction "
+ "to memory for future reference.",
+ "content_type": "text", "flow": ["claude_to_memory"],
+ "activity": "\u2192 OpenExp: storing interaction as new memory",
+ },
+ ],
+ "actions": [
+ {"label": "Composing reply with pricing context",
+ "type": "action", "step_index": 6},
+ {"label": "Sending via Gmail API",
+ "type": "send_email", "step_index": 7},
+ {"label": "Saving interaction to OpenExp memory",
+ "type": "store", "step_index": 8},
+ ],
+ "memories_recalled": [], "memories_count": 0,
+ "step_indices": [6, 7, 8],
+ "phase": "work", "reward_info": None, "duration_hint": 5000,
+ },
+ {
+ "id": 3, "type": "system_end",
+ "title": "Session Complete",
+ "subtitle": "5 memories reinforced via Q-learning",
+ "conversation": [{
+ "role": "system",
+ "text": "Session complete. Computing reward and updating "
+ "Q-values for all 5 retrieved memories.",
+ "content_type": "text", "flow": ["claude_to_memory"],
+ "activity": "\u2190 Q-learning: reward applied to 5 memories",
+ }],
+ "actions": [], "memories_recalled": [], "memories_count": 0,
+ "step_indices": [9], "phase": "reward",
+ "reward_info": {"memories_updated": 5, "alpha": 0.25},
+ "duration_hint": 5000,
+ },
+ ]
+
+ # Steps (backward compat)
+ steps = [
+ {"index": i, "timestamp": now, "type": t, "label": l,
+ "description": d, "phase": p}
+ for i, (t, l, d, p) in enumerate([
+ ("session_start", "Session Start",
+ "Retrieved 5 memories from Q-weighted search", "recall"),
+ ("scan_inbox", "Scanning inbox",
+ "Querying Gmail API for recent messages", "work"),
+ ("read_email", "Reading email",
+ "Found email from Alex Chen about pricing", "work"),
+ ("recall", "Memory search",
+ "Searching OpenExp for DataBridge deal history", "recall"),
+ ("recall", "Memory results",
+ "Retrieved 5 memories (best Q: 0.60)", "recall"),
+ ("action", "Analysis",
+ "Analyzing deal context and drafting response", "work"),
+ ("action", "Composing",
+ "Composing reply with pricing context", "work"),
+ ("send_email", "Sending email",
+ "Sending reply via Gmail API", "work"),
+ ("store", "Saving memory",
+ "Saving interaction to OpenExp memory", "work"),
+ ("session_end", "Session End",
+ "Observations ingested, Q-values updated", "reward"),
+ ])
+ ]
+ steps[-1]["reward_info"] = {"memories_updated": 5, "alpha": 0.25}
+
+ conversation = [
+ {"step_index": 0, "role": "system",
+ "text": "Session started. Loading 847 memories..."},
+ {"step_index": 1, "role": "user",
+ "text": "Check the inbox \u2014 Alex from DataBridge should have "
+ "replied about the proposal pricing."},
+ {"step_index": 5, "role": "assistant",
+ "text": "I'll handle this by checking the inbox, reading the email "
+ "thread and recalling relevant memories."},
+ {"step_index": 6, "role": "user", "text": "Sounds good, send it."},
+ {"step_index": 7, "role": "assistant",
+ "text": "Sending the reply now."},
+ {"step_index": 9, "role": "system",
+ "text": "Session complete. 5 memories updated via Q-learning."},
+ ]
+
+ return {
+ "meta": {
+ "session_id": "demo0001",
+ "generated_at": now,
+ "date": today,
+ "total_steps": len(steps),
+ "total_observations": 8,
+ "memories_retrieved": 5,
+ "total_beats": len(beats),
+ "project": "demo",
+ "demo": True,
+ },
+ "scenario": scenario,
+ "outcome": outcome,
+ "steps": steps,
+ "conversation": conversation,
+ "beats": beats,
+ "memory_q_values": memory_q_values,
+ "q_config": {
+ "alpha": DEFAULT_Q_CONFIG["alpha"],
+ "q_floor": DEFAULT_Q_CONFIG["q_floor"],
+ "q_ceiling": DEFAULT_Q_CONFIG["q_ceiling"],
+ "layer_weights": {
+ "action": DEFAULT_Q_CONFIG["q_action_weight"],
+ "hypothesis": DEFAULT_Q_CONFIG["q_hypothesis_weight"],
+ "fit": DEFAULT_Q_CONFIG["q_fit_weight"],
+ },
+ },
+ }
+
+
+def export_replay_data(session_id):
+ """Export a single session as a step-by-step replay timeline.
+
+ Args:
+        session_id: Full session UUID, or a unique prefix of one.
+
+ Returns:
+ dict with replay timeline, retrieval snapshots, and Q-value changes.
+ """
+ from .core.config import DATA_DIR, Q_CACHE_PATH, OBSERVATIONS_DIR, SESSIONS_DIR
+ from .core.q_value import QCache, DEFAULT_Q_CONFIG
+
+ # --- Load Q-cache ---
+ q_cache = QCache()
+ q_cache.load(Q_CACHE_PATH)
+ cache = q_cache._cache
+
+ # --- Find observations for this session ---
+ obs_dir = Path(OBSERVATIONS_DIR)
+ session_obs = []
+ full_session_id = None
+
+ if obs_dir.exists():
+ for f in sorted(obs_dir.glob("observations-*.jsonl")):
+ for entry in _load_jsonl(f):
+ sid = entry.get("session_id", "")
+ if sid.startswith(session_id):
+ full_session_id = sid
+ session_obs.append(entry)
+
+ if not session_obs:
+ return {"error": f"No observations found for session {session_id}"}
+
+ session_obs.sort(key=lambda x: x.get("timestamp", ""))
+
+ # --- Load retrievals for this session ---
+ retrievals_path = DATA_DIR / "session_retrievals.jsonl"
+ session_retrievals = []
+ for r in _load_jsonl(retrievals_path):
+ if r.get("session_id", "").startswith(session_id):
+ session_retrievals.append(r)
+ session_retrievals.sort(key=lambda x: x.get("timestamp", ""))
+
+ # Collect all retrieved memory IDs and their Q-values
+ all_memory_ids = set()
+ for r in session_retrievals:
+ all_memory_ids.update(r.get("memory_ids", []))
+
+ # --- Fetch memory content previews from Qdrant ---
+ memory_previews = {}
+ try:
+ from .core.config import COLLECTION_NAME, QDRANT_HOST, QDRANT_PORT
+ from qdrant_client import QdrantClient
+ qc = QdrantClient(host=QDRANT_HOST, port=QDRANT_PORT, timeout=5)
+ for mid in all_memory_ids:
+ try:
+ pts = qc.retrieve(
+ collection_name=COLLECTION_NAME,
+ ids=[mid],
+ with_payload=["memory", "memory_type"],
+ )
+ if pts:
+ content = pts[0].payload.get("memory", "")
+ mtype = pts[0].payload.get("memory_type", "fact")
+ preview = _clean_memory_preview(content, mtype)
+ memory_previews[mid[:8]] = {"preview": preview, "type": mtype}
+ except Exception:
+ continue
+ except Exception:
+ pass # Qdrant not available — no previews, degrade gracefully
+
+ memory_q_values = {}
+ alpha = DEFAULT_Q_CONFIG["alpha"]
+ for mid in all_memory_ids:
+ q_nested = cache.get(mid)
+ q = q_nested.get("default") if isinstance(q_nested, dict) and "default" in q_nested else q_nested
+ if q:
+ combined = q.get("q_value", 0)
+ last_reward = q.get("last_reward", 0) or 0
+ action_val = q.get("q_action", 0)
+ hyp_val = q.get("q_hypothesis", 0.5)
+ fit_val = q.get("q_fit", 0.5)
+
+ # Estimate before-session values by reversing the last reward
+ action_w = DEFAULT_Q_CONFIG["q_action_weight"]
+ combined_delta = round(action_w * alpha * last_reward, 4)
+ combined_before = round(combined - combined_delta, 3)
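+            # e.g. alpha=0.25, action weight 0.5, last_reward 0.52 (made-up):
+            # combined_delta = 0.5 * 0.25 * 0.52 = 0.065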
+
+ preview_info = memory_previews.get(mid[:8], {})
+
+ memory_q_values[mid[:8]] = {
+ "combined": round(combined, 3),
+ "combined_before": combined_before,
+ "combined_delta": combined_delta,
+ "action": round(action_val, 3),
+ "hypothesis": round(hyp_val, 3),
+ "fit": round(fit_val, 3),
+ "visits": q.get("q_visits", 0),
+ "last_reward": round(last_reward, 3),
+ "reward_direction": "positive" if last_reward > 0 else "negative" if last_reward < 0 else "neutral",
+ "preview": preview_info.get("preview", ""),
+ "memory_type": preview_info.get("type", ""),
+ }
+
+ # --- Build timeline steps ---
+ steps = []
+
+ # Step 0: Session Start + initial retrieval
+ if session_retrievals:
+ r = session_retrievals[0]
+ mem_ids = r.get("memory_ids", [])
+ scores = r.get("scores", [])
+ recalled = []
+ for i, mid in enumerate(mem_ids):
+ score = scores[i] if i < len(scores) else 0
+ q = memory_q_values.get(mid[:8], {})
+ recalled.append({
+ "id": mid[:8],
+ "score": round(score, 3),
+ "q_combined": q.get("combined", 0),
+ })
+
+ steps.append({
+ "index": 0,
+ "timestamp": r.get("timestamp", session_obs[0]["timestamp"]),
+ "type": "session_start",
+ "label": "Session Start",
+ "description": f"Retrieved {len(mem_ids)} memories from Q-weighted search",
+ "memories_recalled": recalled[:6],
+ "phase": "recall",
+ })
+
+    # Steps for each observation
+    attached_retrievals = set()  # attach each recall retrieval to one step only
+    for i, obs in enumerate(session_obs):
+ step_type, label = _classify_step(obs)
+ summary = _redact(obs.get("summary", ""))
+
+        # Attach a user-message recall to the first observation at or after
+        # its timestamp; without the attached_retrievals guard, the earliest
+        # retrieval would repeat on every later step
+        mid_retrievals = []
+        for r_idx, r in enumerate(session_retrievals[1:], start=1):
+            if r_idx in attached_retrievals:
+                continue
+            r_ts = r.get("timestamp", "")
+            o_ts = obs.get("timestamp", "")
+            if r_ts and o_ts and r_ts <= o_ts:
+                attached_retrievals.add(r_idx)
+                mids = r.get("memory_ids", [])
+                scores = r.get("scores", [])
+                for j, mid in enumerate(mids[:4]):
+                    sc = scores[j] if j < len(scores) else 0
+                    q = memory_q_values.get(mid[:8], {})
+                    mid_retrievals.append({
+                        "id": mid[:8],
+                        "score": round(sc, 3),
+                        "q_combined": q.get("combined", 0),
+                    })
+                break
+
+ step = {
+ "index": len(steps),
+ "timestamp": obs.get("timestamp", ""),
+ "type": step_type,
+ "label": label,
+ "description": summary[:200],
+ "tool": obs.get("tool", ""),
+ "obs_type": obs.get("type", ""),
+ "phase": "work",
+ }
+ if mid_retrievals:
+ step["memories_recalled"] = mid_retrievals
+ step["phase"] = "recall"
+
+ steps.append(step)
+
+ # Final step: Session End + reward
+ steps.append({
+ "index": len(steps),
+ "timestamp": session_obs[-1]["timestamp"] if session_obs else "",
+ "type": "session_end",
+ "label": "Session End",
+ "description": "Observations ingested, session reward computed, Q-values updated",
+ "phase": "reward",
+ "reward_info": {
+ "memories_updated": len(all_memory_ids),
+ "alpha": DEFAULT_Q_CONFIG["alpha"],
+ },
+ })
+
+ # --- Session summary ---
+ sess_dir = Path(SESSIONS_DIR)
+ session_summary = None
+ if sess_dir.exists():
+ for f in sess_dir.glob("*.md"):
+ if session_id in f.name:
+ session_summary = f.read_text()[:500]
+ # Redact paths in summary
+ session_summary = _redact(session_summary)
+ break
+
+ # --- Build conversation from retrieval queries ---
+ conversation = _build_conversation(session_retrievals, steps, session_obs)
+
+ # --- Build narrative beats ---
+ beats = _build_beats(steps, conversation, session_obs)
+
+ # --- Build scenario and outcome ---
+ scenario = _build_scenario(session_obs)
+ outcome = _build_outcome(session_obs, memory_q_values)
+
+ data = {
+ "meta": {
+ "session_id": full_session_id[:8] if full_session_id else session_id[:8],
+ "generated_at": datetime.now().isoformat(),
+ "date": _parse_date(session_obs[0]["timestamp"]) if session_obs else None,
+ "total_steps": len(steps),
+ "total_observations": len(session_obs),
+ "memories_retrieved": len(all_memory_ids),
+ "total_beats": len(beats),
+ "project": session_obs[0].get("project", "") if session_obs else "",
+ },
+ "scenario": scenario,
+ "outcome": outcome,
+ "steps": steps,
+ "conversation": conversation,
+ "beats": beats,
+ "memory_q_values": memory_q_values,
+ "q_config": {
+ "alpha": DEFAULT_Q_CONFIG["alpha"],
+ "q_floor": DEFAULT_Q_CONFIG["q_floor"],
+ "q_ceiling": DEFAULT_Q_CONFIG["q_ceiling"],
+ "layer_weights": {
+ "action": DEFAULT_Q_CONFIG["q_action_weight"],
+ "hypothesis": DEFAULT_Q_CONFIG["q_hypothesis_weight"],
+ "fit": DEFAULT_Q_CONFIG["q_fit_weight"],
+ },
+ },
+ }
+
+ _sanitize(data)
+ return data
+
+
+def find_best_replay_session():
+ """Find the most interesting session for replay demo.
+
+ Prefers sessions with email + memory recall + CRM activity.
+ Returns session_id prefix or None.
+ """
+ from .core.config import OBSERVATIONS_DIR
+
+ obs_dir = Path(OBSERVATIONS_DIR)
+ if not obs_dir.exists():
+ return None
+
+ # Score each session by "interestingness"
+ session_scores = defaultdict(lambda: {"count": 0, "email": 0, "memory": 0, "crm": 0, "date": ""})
+
+ for f in sorted(obs_dir.glob("observations-*.jsonl")):
+ for entry in _load_jsonl(f):
+ sid = entry.get("session_id", "")
+ if not sid:
+ continue
+ s = session_scores[sid]
+ s["count"] += 1
+ summary = entry.get("summary", "").lower()
+ if "email" in summary or "gmail" in summary or "send_email" in summary:
+ s["email"] += 1
+ if "search_memory" in summary or "add_memory" in summary:
+ s["memory"] += 1
+ if "crm" in summary or "leads" in summary or "activities" in summary:
+ s["crm"] += 1
+ ts = entry.get("timestamp", "")
+ if ts > s["date"]:
+ s["date"] = ts
+
+ # Rank: prefer diverse sessions (email + memory + crm) with recent dates
+ ranked = sorted(
+ session_scores.items(),
+ key=lambda x: (
+ min(x[1]["email"], 1) + min(x[1]["memory"], 1) + min(x[1]["crm"], 1),
+ x[1]["count"],
+ x[1]["date"],
+ ),
+ reverse=True,
+ )
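+    # e.g. a session touching email + memory + CRM (diversity 3) outranks a
+    # long coding-only session (diversity <= 1); ties break on step count,
+    # then recency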
+
+ if ranked:
+ return ranked[0][0]
+ return None
+
+
+def _sanitize(data):
+ """Assert no string values contain file paths or sensitive patterns."""
+ sensitive_patterns = [
+ r"/Users/\w+",
+ r"/home/\w+",
+ r"sk-ant-",
+ r"welababeldata",
+ r"ivanpasichnyk",
+ ]
+
+ def _check(obj, path=""):
+ if isinstance(obj, str):
+ for pat in sensitive_patterns:
+ if re.search(pat, obj, re.IGNORECASE):
+ raise ValueError(
+ f"Sensitive data found at {path}: matches pattern '{pat}'"
+ )
+ elif isinstance(obj, dict):
+ for k, v in obj.items():
+ _check(v, f"{path}.{k}")
+ elif isinstance(obj, list):
+ for i, v in enumerate(obj):
+ _check(v, f"{path}[{i}]")
+
+ _check(data)
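+
+
+# _sanitize is assertion-style: it raises instead of cleaning, so callers
+# should redact first (see _redact); e.g. _sanitize({"k": "/Users/x"}) raises.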
diff --git a/tests/test_viz.py b/tests/test_viz.py
new file mode 100644
index 0000000..16ebb6a
--- /dev/null
+++ b/tests/test_viz.py
@@ -0,0 +1,666 @@
+"""Tests for OpenExp visualization data export."""
+import argparse
+import json
+import re
+from pathlib import Path
+from unittest.mock import patch, MagicMock
+
+import pytest
+
+from openexp.viz import (
+ _histogram, _parse_date, _sanitize, _redact, _classify_step,
+ _build_conversation, _build_beats, _summarize_actions, _truncate,
+ export_viz_data, export_replay_data, generate_demo_replay,
+)
+
+
+class TestHistogram:
+ def test_basic_binning(self):
+ values = [0.0, 0.1, 0.2, 0.5, 0.9, 1.0]
+ result = _histogram(values, bin_start=0, bin_end=1.0, num_bins=10)
+ assert len(result["histogram"]) == 10
+ assert sum(b["count"] for b in result["histogram"]) == len(values)
+
+ def test_stats(self):
+ values = [0.0, 0.5, 1.0]
+ result = _histogram(values)
+ assert result["stats"]["min"] == 0.0
+ assert result["stats"]["max"] == 1.0
+ assert result["stats"]["count"] == 3
+
+ def test_empty_values(self):
+ result = _histogram([])
+ assert result["histogram"] == []
+ assert result["stats"] == {}
+
+ def test_single_value(self):
+ result = _histogram([0.5])
+ assert result["stats"]["mean"] == 0.5
+ assert result["stats"]["std"] == 0
+
+ def test_negative_values(self):
+ values = [-0.5, -0.3, 0.0, 0.5]
+ result = _histogram(values, bin_start=-0.5, bin_end=1.0, num_bins=15)
+ assert sum(b["count"] for b in result["histogram"]) == len(values)
+
+ def test_all_same_value(self):
+ values = [0.5, 0.5, 0.5]
+ result = _histogram(values)
+ assert sum(b["count"] for b in result["histogram"]) == 3
+ assert result["stats"]["mean"] == 0.5
+
+
+class TestParseDate:
+ def test_iso_timestamp(self):
+ assert _parse_date("2026-03-20T17:41:11.837715+00:00") == "2026-03-20"
+
+ def test_date_only(self):
+ assert _parse_date("2026-03-20") == "2026-03-20"
+
+ def test_none(self):
+ assert _parse_date(None) is None
+
+ def test_empty(self):
+ assert _parse_date("") is None
+
+
+class TestSanitize:
+ def test_clean_data_passes(self):
+ data = {"key": "hello", "nested": {"list": [1, 2, "safe"]}}
+ _sanitize(data)
+
+ def test_file_path_caught(self):
+ with pytest.raises(ValueError, match="Sensitive data"):
+ _sanitize({"key": "/Users/someone/secret"})
+
+ def test_api_key_caught(self):
+ with pytest.raises(ValueError, match="Sensitive data"):
+ _sanitize({"key": "sk-ant-abc123"})
+
+ def test_username_caught(self):
+ with pytest.raises(ValueError, match="Sensitive data"):
+ _sanitize({"key": "ivanpasichnyk"})
+
+ def test_numeric_values_ok(self):
+ data = {"q": 0.5, "count": 100, "nested": [1, 2, 3]}
+ _sanitize(data)
+
+ def test_deep_nesting(self):
+ with pytest.raises(ValueError):
+ _sanitize({"a": {"b": {"c": ["/Users/test/path"]}}})
+
+
+class TestExportVizData:
+ def _make_q_cache(self, tmp_path, entries=None):
+ """Write a Q-cache JSON file and return its path."""
+ cache_path = tmp_path / "q_cache.json"
+ cache_path.write_text(json.dumps(entries or {}))
+ return cache_path
+
+ def test_empty_q_cache(self, tmp_path):
+ """Export with empty Q-cache should produce valid structure."""
+ cache_path = self._make_q_cache(tmp_path)
+ obs_dir = tmp_path / "obs"
+ obs_dir.mkdir()
+ sess_dir = tmp_path / "sess"
+ sess_dir.mkdir()
+
+ with patch("openexp.core.config.Q_CACHE_PATH", cache_path), \
+ patch("openexp.core.config.DATA_DIR", tmp_path), \
+ patch("openexp.core.config.OBSERVATIONS_DIR", obs_dir), \
+ patch("openexp.core.config.SESSIONS_DIR", sess_dir):
+ data = export_viz_data(no_qdrant=True)
+
+ assert data["meta"]["total_memories"] == 0
+ assert data["q_distribution"]["combined"]["histogram"] == []
+ assert data["q_evolution"] == []
+ assert data["lifecycle"] == {}
+
+ def test_with_q_values(self, tmp_path):
+ """Export with sample Q-values produces correct distribution."""
+ entries = {
+ "id1": {"default": {"q_value": 0.5, "q_action": 0.6, "q_hypothesis": 0.4, "q_fit": 0.5,
+ "q_visits": 2, "q_updated_at": "2026-03-20T10:00:00", "calibration": "neutral"}},
+ "id2": {"default": {"q_value": 0.3, "q_action": 0.3, "q_hypothesis": 0.3, "q_fit": 0.3,
+ "q_visits": 1, "q_updated_at": "2026-03-21T10:00:00", "calibration": "valuable"}},
+ }
+ cache_path = self._make_q_cache(tmp_path, entries)
+ obs_dir = tmp_path / "obs"
+ obs_dir.mkdir()
+ sess_dir = tmp_path / "sess"
+ sess_dir.mkdir()
+
+ with patch("openexp.core.config.Q_CACHE_PATH", cache_path), \
+ patch("openexp.core.config.DATA_DIR", tmp_path), \
+ patch("openexp.core.config.OBSERVATIONS_DIR", obs_dir), \
+ patch("openexp.core.config.SESSIONS_DIR", sess_dir):
+ data = export_viz_data(no_qdrant=True)
+
+ assert data["meta"]["total_memories"] == 2
+ assert data["q_distribution"]["combined"]["stats"]["count"] == 2
+ assert len(data["q_evolution"]) == 2
+ assert data["calibration_counts"]["neutral"] == 1
+ assert data["calibration_counts"]["valuable"] == 1
+
+ def test_output_is_json_serializable(self, tmp_path):
+ """Exported data must be JSON-serializable."""
+ cache_path = self._make_q_cache(tmp_path)
+ obs_dir = tmp_path / "obs"
+ obs_dir.mkdir()
+ sess_dir = tmp_path / "sess"
+ sess_dir.mkdir()
+
+ with patch("openexp.core.config.Q_CACHE_PATH", cache_path), \
+ patch("openexp.core.config.DATA_DIR", tmp_path), \
+ patch("openexp.core.config.OBSERVATIONS_DIR", obs_dir), \
+ patch("openexp.core.config.SESSIONS_DIR", sess_dir):
+ data = export_viz_data(no_qdrant=True)
+
+ json_str = json.dumps(data, default=str)
+ assert len(json_str) > 0
+
+ def test_with_observations(self, tmp_path):
+ """Observation files should be counted by line."""
+ cache_path = self._make_q_cache(tmp_path)
+ obs_dir = tmp_path / "obs"
+ obs_dir.mkdir()
+ sess_dir = tmp_path / "sess"
+ sess_dir.mkdir()
+
+ # Create a fake observations file
+ obs_file = obs_dir / "observations-2026-03-20.jsonl"
+ obs_file.write_text('{"a":1}\n{"b":2}\n{"c":3}\n')
+
+ with patch("openexp.core.config.Q_CACHE_PATH", cache_path), \
+ patch("openexp.core.config.DATA_DIR", tmp_path), \
+ patch("openexp.core.config.OBSERVATIONS_DIR", obs_dir), \
+ patch("openexp.core.config.SESSIONS_DIR", sess_dir):
+ data = export_viz_data(no_qdrant=True)
+
+ assert len(data["observations_timeline"]) == 1
+ assert data["observations_timeline"][0]["observations_count"] == 3
+ assert data["meta"]["total_observations"] == 3
+
+
+class TestCLIIntegration:
+ def test_viz_subparser_exists(self):
+ """CLI should have cmd_viz function."""
+ import openexp.cli as cli_mod
+ assert hasattr(cli_mod, "cmd_viz")
+
+ def test_viz_output_file(self, tmp_path):
+ """cmd_viz should create output HTML file."""
+ output = tmp_path / "test-viz.html"
+ cache_path = tmp_path / "q_cache.json"
+ cache_path.write_text("{}")
+ obs_dir = tmp_path / "obs"
+ obs_dir.mkdir()
+ sess_dir = tmp_path / "sess"
+ sess_dir.mkdir()
+
+ with patch("openexp.core.config.Q_CACHE_PATH", cache_path), \
+ patch("openexp.core.config.DATA_DIR", tmp_path), \
+ patch("openexp.core.config.OBSERVATIONS_DIR", obs_dir), \
+ patch("openexp.core.config.SESSIONS_DIR", sess_dir), \
+ patch("webbrowser.open"):
+ from openexp.cli import cmd_viz
+ args = argparse.Namespace(output=str(output), no_open=True, no_qdrant=True, replay=None)
+ cmd_viz(args)
+
+ assert output.exists()
+ content = output.read_text()
+ assert "VIZ_DATA" in content
+ assert "OpenExp" in content
+ assert not re.search(r"/Users/\w+", content)
+
+ def test_viz_replay_flag(self, tmp_path):
+ """cmd_viz with --replay should use replay template."""
+ cache_path = tmp_path / "q_cache.json"
+ cache_path.write_text("{}")
+ obs_dir = tmp_path / "obs"
+ obs_dir.mkdir()
+ sess_dir = tmp_path / "sess"
+ sess_dir.mkdir()
+
+ # Create fake observation for session abc12345
+ obs_file = obs_dir / "observations-2026-03-20.jsonl"
+ obs_file.write_text(json.dumps({
+ "id": "obs-1", "timestamp": "2026-03-20T10:00:00Z",
+ "session_id": "abc12345-xxxx", "type": "feature",
+ "tool": "Bash", "summary": "Ran: echo hello", "project": "test",
+ }) + "\n")
+
+ output = tmp_path / "test-replay.html"
+
+ with patch("openexp.core.config.Q_CACHE_PATH", cache_path), \
+ patch("openexp.core.config.DATA_DIR", tmp_path), \
+ patch("openexp.core.config.OBSERVATIONS_DIR", obs_dir), \
+ patch("openexp.core.config.SESSIONS_DIR", sess_dir), \
+ patch("webbrowser.open"):
+ from openexp.cli import cmd_viz
+ args = argparse.Namespace(
+ output=str(output), no_open=True, no_qdrant=True, replay="abc12345",
+ )
+ cmd_viz(args)
+
+ # Output goes to the specified path when --output is given
+ assert output.exists()
+ content = output.read_text()
+ assert "REPLAY_DATA" in content
+ assert "Session Replay" in content
+
+
+class TestRedact:
+ def test_redact_file_path(self):
+ assert "/~/..." in _redact("Ran: cat /Users/someone/file.txt")
+
+ def test_redact_email(self):
+ result = _redact("from:anna@example.com")
+ assert "anna@" not in result
+ assert "an***@example.com" in result
+
+ def test_redact_api_key(self):
+ assert "sk-***" in _redact("key: sk-ant-abc123def456")
+
+ def test_clean_text_unchanged(self):
+ assert _redact("hello world") == "hello world"
+
+ def test_empty(self):
+ assert _redact("") == ""
+ assert _redact(None) == ""
+
+
+class TestClassifyStep:
+ def test_scan_inbox(self):
+ assert _classify_step({"summary": "read_emails.py 15 is:unread"})[0] == "scan_inbox"
+
+ def test_send_email(self):
+ assert _classify_step({"summary": "send_email.py --to someone"})[0] == "send_email"
+
+ def test_search_email(self):
+ assert _classify_step({"summary": "read_emails.py subject:meeting"})[0] == "search_email"
+
+ def test_crm(self):
+ assert _classify_step({"summary": "grep crm/leads.csv"})[0] == "crm"
+
+ def test_generic(self):
+ assert _classify_step({"summary": "ls -la", "tool": "Bash"})[0] == "action"
+
+
+class TestExportReplayData:
+ def test_with_observations(self, tmp_path):
+ """Replay export should build timeline from observations."""
+ cache_path = tmp_path / "q_cache.json"
+ cache_path.write_text("{}")
+ obs_dir = tmp_path / "obs"
+ obs_dir.mkdir()
+ sess_dir = tmp_path / "sess"
+ sess_dir.mkdir()
+
+ obs = [
+ {"id": "obs-1", "timestamp": "2026-03-20T10:00:00Z",
+ "session_id": "test1234-abcd", "type": "feature",
+ "tool": "Bash", "summary": "Ran: read_emails.py is:unread", "project": "test"},
+ {"id": "obs-2", "timestamp": "2026-03-20T10:01:00Z",
+ "session_id": "test1234-abcd", "type": "outreach",
+ "tool": "Bash", "summary": "Ran: send_email.py --to x@test.com", "project": "test"},
+ ]
+ obs_file = obs_dir / "observations-2026-03-20.jsonl"
+ obs_file.write_text("\n".join(json.dumps(o) for o in obs) + "\n")
+
+ with patch("openexp.core.config.Q_CACHE_PATH", cache_path), \
+ patch("openexp.core.config.DATA_DIR", tmp_path), \
+ patch("openexp.core.config.OBSERVATIONS_DIR", obs_dir), \
+ patch("openexp.core.config.SESSIONS_DIR", sess_dir):
+ data = export_replay_data("test1234")
+
+ assert "error" not in data
+ assert data["meta"]["total_observations"] == 2
+ assert data["meta"]["session_id"] == "test1234"
+        # With no retrievals there is no session_start step: 2 obs + session_end = 3 steps
+ assert data["steps"][-1]["type"] == "session_end"
+ assert "beats" in data
+ assert isinstance(data["beats"], list)
+ assert len(data["beats"]) >= 2 # at least start + end
+
+ def test_no_observations(self, tmp_path):
+ """Missing session should return error."""
+ cache_path = tmp_path / "q_cache.json"
+ cache_path.write_text("{}")
+ obs_dir = tmp_path / "obs"
+ obs_dir.mkdir()
+ sess_dir = tmp_path / "sess"
+ sess_dir.mkdir()
+
+ with patch("openexp.core.config.Q_CACHE_PATH", cache_path), \
+ patch("openexp.core.config.DATA_DIR", tmp_path), \
+ patch("openexp.core.config.OBSERVATIONS_DIR", obs_dir), \
+ patch("openexp.core.config.SESSIONS_DIR", sess_dir):
+ data = export_replay_data("nonexistent")
+
+ assert "error" in data
+
+ def test_sanitization(self, tmp_path):
+ """Replay output should not contain file paths."""
+ cache_path = tmp_path / "q_cache.json"
+ cache_path.write_text("{}")
+ obs_dir = tmp_path / "obs"
+ obs_dir.mkdir()
+ sess_dir = tmp_path / "sess"
+ sess_dir.mkdir()
+
+ obs = [
+ {"id": "obs-1", "timestamp": "2026-03-20T10:00:00Z",
+ "session_id": "sanitize-test", "type": "feature",
+ "tool": "Bash", "summary": "Ran: cat /Users/someone/secret.txt", "project": "test"},
+ ]
+ obs_file = obs_dir / "observations-2026-03-20.jsonl"
+ obs_file.write_text(json.dumps(obs[0]) + "\n")
+
+ with patch("openexp.core.config.Q_CACHE_PATH", cache_path), \
+ patch("openexp.core.config.DATA_DIR", tmp_path), \
+ patch("openexp.core.config.OBSERVATIONS_DIR", obs_dir), \
+ patch("openexp.core.config.SESSIONS_DIR", sess_dir):
+ data = export_replay_data("sanitize-test")
+
+ # Should pass sanitization (paths redacted)
+ json_str = json.dumps(data, default=str)
+ assert "/Users/someone" not in json_str
+
+
+class TestBuildConversation:
+ def test_basic_conversation(self):
+ """Should produce user + assistant messages from retrievals and observations."""
+ retrievals = [
+ {"timestamp": "2026-03-20T10:00:00Z", "query": "session start context",
+ "memory_ids": [], "scores": []},
+ {"timestamp": "2026-03-20T10:01:00Z", "query": "check inbox for new emails",
+ "memory_ids": [], "scores": []},
+ ]
+ steps = [
+ {"index": 0, "timestamp": "2026-03-20T10:00:00Z", "type": "session_start",
+ "label": "Session Start", "phase": "recall"},
+ {"index": 1, "timestamp": "2026-03-20T10:01:30Z", "type": "scan_inbox",
+ "label": "Scanning inbox", "phase": "work", "tool": "Bash"},
+ {"index": 2, "timestamp": "2026-03-20T10:02:00Z", "type": "session_end",
+ "label": "Session End", "phase": "reward"},
+ ]
+ obs = [
+ {"summary": "Ran: read_emails.py 15 is:unread", "tool": "Bash", "type": "feature"},
+ ]
+
+ result = _build_conversation(retrievals, steps, obs)
+
+ roles = [m["role"] for m in result]
+ assert "system" in roles
+ assert "user" in roles
+ assert "assistant" in roles
+
+ def test_empty_retrievals(self):
+ """No retrievals should produce only system messages."""
+ steps = [
+ {"index": 0, "timestamp": "2026-03-20T10:00:00Z", "type": "scan_inbox",
+ "label": "Scanning", "phase": "work", "tool": "Bash"},
+ ]
+ obs = [{"summary": "Ran: ls", "tool": "Bash", "type": "feature"}]
+
+ result = _build_conversation([], steps, obs)
+ # Should have system start + assistant action + system end
+ assert any(m["role"] == "system" for m in result)
+
+ def test_redaction_in_conversation(self):
+ """File paths and emails should be redacted in conversation."""
+ retrievals = [
+ {"timestamp": "2026-03-20T10:00:00Z", "query": "auto",
+ "memory_ids": [], "scores": []},
+ {"timestamp": "2026-03-20T10:01:00Z",
+ "query": "read /Users/someone/secret.txt and email alice@example.com",
+ "memory_ids": [], "scores": []},
+ ]
+ steps = [
+ {"index": 0, "timestamp": "2026-03-20T10:00:00Z", "type": "session_start",
+ "label": "Start", "phase": "recall"},
+ {"index": 1, "timestamp": "2026-03-20T10:02:00Z", "type": "action",
+ "label": "Working", "phase": "work", "tool": "Bash"},
+ ]
+ obs = [{"summary": "Ran: cat file", "tool": "Bash", "type": "feature"}]
+
+ result = _build_conversation(retrievals, steps, obs)
+ all_text = " ".join(m["text"] for m in result)
+ assert "/Users/someone" not in all_text
+ assert "alice@example.com" not in all_text
+
+ def test_conversation_in_replay_output(self, tmp_path):
+ """export_replay_data should include conversation field."""
+ cache_path = tmp_path / "q_cache.json"
+ cache_path.write_text("{}")
+ obs_dir = tmp_path / "obs"
+ obs_dir.mkdir()
+ sess_dir = tmp_path / "sess"
+ sess_dir.mkdir()
+
+ obs = [
+ {"id": "obs-1", "timestamp": "2026-03-20T10:00:00Z",
+ "session_id": "conv-test-1234", "type": "feature",
+ "tool": "Bash", "summary": "Ran: read_emails.py is:unread", "project": "test"},
+ ]
+ obs_file = obs_dir / "observations-2026-03-20.jsonl"
+ obs_file.write_text(json.dumps(obs[0]) + "\n")
+
+ with patch("openexp.core.config.Q_CACHE_PATH", cache_path), \
+ patch("openexp.core.config.DATA_DIR", tmp_path), \
+ patch("openexp.core.config.OBSERVATIONS_DIR", obs_dir), \
+ patch("openexp.core.config.SESSIONS_DIR", sess_dir):
+ data = export_replay_data("conv-test")
+
+ assert "conversation" in data
+ assert isinstance(data["conversation"], list)
+
+
+class TestTruncate:
+ def test_short_text(self):
+ assert _truncate("hello", 10) == "hello"
+
+ def test_long_text(self):
+ result = _truncate("a" * 200, 50)
+ assert len(result) == 50
+ assert result.endswith("…")
+
+ def test_none(self):
+ assert _truncate(None) == ""
+
+ def test_empty(self):
+ assert _truncate("") == ""
+
+
+class TestSummarizeActions:
+ def test_single_action(self):
+ result = _summarize_actions(["scan_inbox"])
+ assert "checking the inbox" in result
+ assert result.startswith("I'll handle this by")
+
+ def test_multiple_actions(self):
+ result = _summarize_actions(["scan_inbox", "read_email", "check_sent"])
+ assert "checking the inbox" in result
+ assert "reading the email thread" in result
+ assert " and " in result
+
+ def test_empty(self):
+ assert _summarize_actions([]) == "Working on it."
+
+ def test_deduplication(self):
+ result = _summarize_actions(["scan_inbox", "scan_inbox", "read_email"])
+ assert result.count("checking the inbox") == 1
+
+
+class TestBuildBeats:
+ def _make_steps_and_conv(self, num_obs=3, user_msgs=None):
+ """Helper to create steps and conversation for beat testing."""
+ steps = [
+ {"index": 0, "timestamp": "2026-03-20T10:00:00Z", "type": "session_start",
+ "label": "Session Start", "phase": "recall",
+ "memories_recalled": [{"id": "mem1", "score": 0.8, "q_combined": 0.5}]},
+ ]
+ obs = []
+ for i in range(num_obs):
+ steps.append({
+ "index": i + 1, "timestamp": f"2026-03-20T10:0{i+1}:00Z",
+ "type": "scan_inbox" if i == 0 else "read_email" if i == 1 else "send_email",
+ "label": "Scanning inbox" if i == 0 else "Reading email" if i == 1 else "Sending email",
+ "description": f"action {i}", "tool": "Bash", "phase": "work",
+ "memories_recalled": [{"id": f"mem{i+2}", "score": 0.7, "q_combined": 0.4}] if i == 0 else [],
+ })
+ obs.append({"summary": f"action {i}", "tool": "Bash", "type": "feature"})
+
+ steps.append({
+ "index": len(steps), "timestamp": "2026-03-20T10:10:00Z",
+ "type": "session_end", "label": "Session End", "phase": "reward",
+ "reward_info": {"memories_updated": 5, "alpha": 0.25},
+ })
+
+ conversation = [
+ {"step_index": 0, "role": "system", "text": "Session started."},
+ ]
+ if user_msgs:
+ for step_idx, text in user_msgs:
+ conversation.append({"step_index": step_idx, "role": "user", "text": text})
+ conversation.append({"step_index": len(steps) - 1, "role": "system",
+ "text": "Session complete."})
+ return steps, conversation, obs
+
+ def test_basic_beat_grouping(self):
+ """Steps group around user messages, has start/end."""
+ steps, conv, obs = self._make_steps_and_conv(
+ num_obs=3, user_msgs=[(1, "Check the inbox?")])
+ beats = _build_beats(steps, conv, obs)
+
+ assert beats[0]["type"] == "system_start"
+ assert beats[-1]["type"] == "system_end"
+ assert any(b["type"] == "user_turn" for b in beats)
+
+ def test_two_user_messages_create_two_beats(self):
+ """Each user msg = new beat."""
+ steps, conv, obs = self._make_steps_and_conv(
+ num_obs=4, user_msgs=[(1, "Check inbox?"), (3, "OK, send it.")])
+ beats = _build_beats(steps, conv, obs)
+
+ user_beats = [b for b in beats if b["type"] == "user_turn"]
+ assert len(user_beats) == 2
+ assert user_beats[0]["conversation"][0]["text"] == "Check inbox?"
+ assert user_beats[1]["conversation"][0]["text"] == "OK, send it."
+
+ def test_empty_conversation(self):
+ """Still produces start + end beats even with no user messages."""
+ steps, conv, obs = self._make_steps_and_conv(num_obs=2, user_msgs=None)
+ beats = _build_beats(steps, conv, obs)
+
+ assert len(beats) >= 2
+ assert beats[0]["type"] == "system_start"
+ assert beats[-1]["type"] == "system_end"
+
+ def test_beat_memories_deduplicated(self):
+ """Same memory across steps counted once per beat."""
+ steps = [
+ {"index": 0, "type": "session_start", "timestamp": "T0", "phase": "recall",
+ "memories_recalled": [{"id": "m1", "score": 0.9, "q_combined": 0.5}]},
+ {"index": 1, "type": "scan_inbox", "timestamp": "T1", "phase": "work",
+ "label": "Scan", "description": "scan", "tool": "Bash",
+ "memories_recalled": [{"id": "m2", "score": 0.8, "q_combined": 0.4}]},
+ {"index": 2, "type": "read_email", "timestamp": "T2", "phase": "work",
+ "label": "Read", "description": "read", "tool": "Bash",
+ "memories_recalled": [{"id": "m2", "score": 0.8, "q_combined": 0.4}]},
+ {"index": 3, "type": "session_end", "timestamp": "T3", "phase": "reward",
+ "label": "End", "reward_info": {"memories_updated": 2, "alpha": 0.25}},
+ ]
+ conv = [
+ {"step_index": 0, "role": "system", "text": "Started."},
+ {"step_index": 3, "role": "system", "text": "Done."},
+ ]
+ obs = [{"summary": "scan", "tool": "Bash"}, {"summary": "read", "tool": "Bash"}]
+
+ beats = _build_beats(steps, conv, obs)
+ # The auto beat should have m2 only once
+ auto_beat = [b for b in beats if b["type"] == "auto"][0]
+ mem_ids = [m["id"] for m in auto_beat["memories_recalled"]]
+ assert mem_ids.count("m2") == 1
+
+ def test_beat_actions_preserve_order(self):
+ """Actions match step order."""
+ steps, conv, obs = self._make_steps_and_conv(
+ num_obs=3, user_msgs=[(1, "Do it")])
+ beats = _build_beats(steps, conv, obs)
+
+ user_beat = [b for b in beats if b["type"] == "user_turn"][0]
+ indices = [a["step_index"] for a in user_beat["actions"]]
+ assert indices == sorted(indices)
+
+ def test_sanitization_of_beats(self, tmp_path):
+ """Beat data should pass _sanitize()."""
+ steps, conv, obs = self._make_steps_and_conv(
+ num_obs=2, user_msgs=[(1, "Check it")])
+ beats = _build_beats(steps, conv, obs)
+ # Should not raise
+ _sanitize({"beats": beats})
+
+ def test_summarize_actions_readable(self):
+ """Summary should produce readable English."""
+ result = _summarize_actions(["scan_inbox", "read_email"])
+ assert "I'll" in result
+ assert result.endswith(".")
+
+ def test_duration_hint_scales(self):
+ """More actions = longer hint."""
+ steps_short, conv_s, obs_s = self._make_steps_and_conv(
+ num_obs=1, user_msgs=[(1, "Go")])
+ steps_long, conv_l, obs_l = self._make_steps_and_conv(
+ num_obs=5, user_msgs=[(1, "Go")])
+ beats_short = _build_beats(steps_short, conv_s, obs_s)
+ beats_long = _build_beats(steps_long, conv_l, obs_l)
+
+ # Find user_turn beats
+ short_beat = [b for b in beats_short if b["type"] == "user_turn"][0]
+ long_beat = [b for b in beats_long if b["type"] == "user_turn"][0]
+ assert long_beat["duration_hint"] >= short_beat["duration_hint"]
+
+
+class TestDemoReplay:
+ def test_generate_demo_replay_structure(self):
+ data = generate_demo_replay()
+ assert data["meta"]["demo"] is True
+ assert data["meta"]["session_id"] == "demo0001"
+ assert len(data["beats"]) == 4
+ assert data["beats"][0]["type"] == "system_start"
+ assert data["beats"][1]["type"] == "user_turn"
+ assert data["beats"][2]["type"] == "user_turn"
+ assert data["beats"][3]["type"] == "system_end"
+
+ def test_demo_has_rich_conversation(self):
+ data = generate_demo_replay()
+ beat1 = data["beats"][1]
+ conv = beat1["conversation"]
+ assert len(conv) >= 5
+ types = [c.get("content_type", "text") for c in conv]
+ assert "email_card" in types
+ assert "memory_results" in types
+
+ def test_demo_has_flow_events(self):
+ data = generate_demo_replay()
+ beat1 = data["beats"][1]
+ for c in beat1["conversation"]:
+ assert "flow" in c
+
+ def test_demo_has_q_values(self):
+ data = generate_demo_replay()
+ assert len(data["memory_q_values"]) == 5
+ for mid, q in data["memory_q_values"].items():
+ assert "combined" in q
+ assert "combined_before" in q
+ assert q["reward_direction"] == "positive"
+
+ def test_demo_is_json_serializable(self):
+ data = generate_demo_replay()
+ json.dumps(data, default=str)
+
+ def test_demo_no_sensitive_data(self):
+ data = generate_demo_replay()
+ _sanitize(data)
From 9d04e3ac3aebc33af6d3d558d2aff392ba97405a Mon Sep 17 00:00:00 2001
From: Ivan Pasichnyk
Date: Sun, 29 Mar 2026 21:04:22 -0700
Subject: [PATCH 23/59] docs: 5-level storage pyramid, product page content,
architecture update
- docs/storage-system.md: comprehensive reference for L0-L4 storage,
all 4 reward paths, Q-learning formulas, 16 MCP tools
- docs/product-page-content.md: marketing copy for product page
- Updated CLAUDE.md and architecture.md to reference new modules
- Config: added explanation-related environment variables
Co-Authored-By: Claude Opus 4.6
---
CLAUDE.md | 16 +-
docs/architecture.md | 3 +
docs/product-page-content.md | 238 ++++++++++++++++++
docs/storage-system.md | 455 +++++++++++++++++++++++++++++++++++
openexp/core/config.py | 4 +
5 files changed, 710 insertions(+), 6 deletions(-)
create mode 100644 docs/product-page-content.md
create mode 100644 docs/storage-system.md
diff --git a/CLAUDE.md b/CLAUDE.md
index 67ef243..5a3d9da 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -24,13 +24,17 @@ Immediately. Don't wait. Every piece of context improves future retrieval.
## Architecture
-- `openexp/core/` — Q-learning engine (q_value, search, scoring, lifecycle)
-- `openexp/ingest/` — Observation → Qdrant pipeline
-- `openexp/resolvers/` — Outcome resolvers (CRM → rewards)
-- `openexp/hooks/` — Claude Code integration (session-start, post-tool-use, session-end)
-- `openexp/mcp_server.py` — MCP STDIO server
+**Full reference:** `docs/storage-system.md` — 5-level pyramid (L0–L4), all 4 reward paths, Q-learning formulas, 16 MCP tools, every file and env var. **Read that instead of re-reading source code.**
+
+- `openexp/core/` — Q-learning engine (q_value, search, scoring, lifecycle, explanation, reward_log)
+- `openexp/ingest/` — Observation → Qdrant pipeline + session reward (Path 1)
+- `openexp/reward_tracker.py` — Prediction → outcome rewards (Path 2)
+- `openexp/outcome.py` — Business event rewards (Path 3)
+- `openexp/resolvers/` — Outcome resolvers (CRM CSV → rewards)
+- `openexp/mcp_server.py` — MCP STDIO server (16 tools) + calibration rewards (Path 4)
- `openexp/cli.py` — CLI interface
-- `tests/` — pytest suite
+- `openexp/viz.py` — Visualization data export
+- `tests/` — 237 tests across 11 files
## Q-Learning (do not change without discussion)
diff --git a/docs/architecture.md b/docs/architecture.md
index 26b7053..4806f94 100644
--- a/docs/architecture.md
+++ b/docs/architecture.md
@@ -1,5 +1,8 @@
# Architecture
+> **Full storage system docs:** See [storage-system.md](storage-system.md) for the complete
+> 5-level pyramid (L0–L4), all 4 reward paths, Q-learning formulas, MCP tools, and file map.
+
## System Overview
```
diff --git a/docs/product-page-content.md b/docs/product-page-content.md
new file mode 100644
index 0000000..27853ca
--- /dev/null
+++ b/docs/product-page-content.md
@@ -0,0 +1,238 @@
+# OpenExp — Product Page Content
+
+> Source of truth for website/landing page. Written for humans, not developers.
+> Last updated: 2026-03-26
+
+---
+
+## Headline
+
+**Your AI sessions don't learn from each other. OpenExp fixes that.**
+
+## Subheadline
+
+Persistent memory for Claude Code with Q-learning. Every outcome — commit, deploy, closed deal — feeds back as a signal. Over time, your AI knows what works.
+
+---
+
+## The Problem
+
+There are three ways people give context to AI coding assistants today.
+
+### 1. Static instructions (CLAUDE.md)
+
+You write a file with rules and preferences. The AI reads it at the start of each session. It works — but it doesn't learn. To change priorities, you edit the file by hand. The AI itself never updates its understanding of what matters.
+
+### 2. Bring everything (full context)
+
+Pack your CRM, project management, chat history, docs — everything — into the context window. The AI has access to it all. But it's expensive (tokens cost money), slow (large contexts = slower responses), and still doesn't scale. At some point, you can't fit it all in.
+
+### 3. Memory services (Mem0, Zep, LangMem)
+
+Store memories in a database. Search and retrieve when relevant. Better than static files — but every memory is treated as equally important. A critical architecture decision and a random grep command carry the same weight. There's no learning.
+
+---
+
+## The OpenExp Approach
+
+Write everything. Remember selectively. **Learn from outcomes.**
+
+### How it works
+
+**1. Automatic capture**
+
+Every action in your Claude Code session — file edits, commits, commands, decisions — is automatically recorded as a memory. You don't do anything. Hooks handle it.
+
+**2. Smart retrieval**
+
+Before each response, the system finds the 5-10 most relevant memories and injects them into context. Not by similarity alone — by **proven usefulness**.
+
+**3. Reward loop**
+
+After every session, the system looks at what happened:
+
+| Session outcome | Signal |
+|----------------|--------|
+| Code committed | +0.3 |
+| Pull request created | +0.2 |
+| Deployed to production | +0.1 |
+| Tests passed | +0.1 |
+| Nothing produced | -0.1 |
+
+Memories that were used in productive sessions get a higher score. Memories from empty sessions get a lower score.
+
+This is Q-learning — the same family of reinforcement learning that powered AlphaGo. Applied to your working memory.
+
+**After a month of use, search results are fundamentally different from plain semantic search.** Proven memories surface first. Noise sinks.
+
+---
+
+## Experiences — Different Lenses on the Same Memory
+
+One memory can be valuable in one context and worthless in another.
+
+An Experience is a definition of what "success" means for a specific workflow. You create it as a simple YAML config.
+
+### For a developer (default)
+
+```yaml
+weights:
+ commit: 0.3
+ pr: 0.2
+ deploy: 0.1
+ tests: 0.1
+```
+
+### For sales
+
+```yaml
+weights:
+ email_sent: 0.15
+ proposal_sent: 0.20
+ payment_received: 0.30
+ commit: 0.0
+```
+
+### For support
+
+```yaml
+weights:
+ ticket_closed: 0.25
+ escalation_avoided: 0.20
+ customer_reply: 0.10
+```
+
+### For content creation
+
+```yaml
+weights:
+ post_published: 0.25
+ engagement: 0.15
+ subscriber_gained: 0.20
+```
+
+**Each memory holds separate scores per experience.** In a sales context, sales-relevant memories surface. In a coding context — coding memories.
+
+### Example
+
+Memory: *"Discussed NDA with client — lawyers took 2 weeks, 10+7 year term"*
+
+| Experience | Score | Why |
+|-----------|-------|-----|
+| **coding** | 0.05 | Session had no commits. Useless for coding. |
+| **dealflow** | 0.72 | NDA led to proposal, then payment. Very useful for sales. |
+
+Same memory. Different scores. The active lens determines what surfaces.
+
+You can create custom experiences with `openexp experience create` or drop an `.openexp.yaml` into any project folder for automatic per-project switching.
+
+---
+
+## Four Reward Channels
+
+Not just session outcomes. Four ways to feed signals back.
+
+### 1. Session (automatic)
+
+After every session, the system analyzes what was produced and rewards memories accordingly. No manual action required.
+
+### 2. Predictions
+
+Your AI says "I predict the client will sign." Later, you report the actual outcome. The accuracy difference becomes a reward signal.
+
+### 3. Business events
+
+Connect your CRM. When a deal closes or payment arrives, all memories tagged with that client automatically receive a reward. Real business outcomes flow back to the knowledge that contributed.
+
+### 4. Manual calibration
+
+You know best. Mark any memory as valuable or worthless directly. Override the algorithm when you have knowledge it doesn't.
+
+---
+
+## Five Levels of Understanding
+
+A number alone doesn't explain itself. When you see Q=0.8, you don't know why. Each level adds depth.
+
+| Level | What | Purpose |
+|-------|------|---------|
+| **L0** | Raw session logs | Full audit trail |
+| **L1** | Q-value (one number) | Search ranking |
+| **L2** | Short notes: "Session +0.30: 2 commits, 1 PR" | Quick context for score changes |
+| **L3** | Full record with all context | Detailed audit |
+| **L4** | LLM explanation: "This memory helped because it contained the architecture decision for module X" | Human-readable reasoning |
+
+L1-L2 are in memory — fast, used for ranking. L3-L4 are on disk — for when you want to understand why a memory has its score.
+
+Ask any time: `explain_q("memory-id")` — get the full story.
+
+---
+
+## Search: Five Factors
+
+Not just "find similar text." Five components weighted together.
+
+| Factor | Weight | What it does |
+|--------|--------|-------------|
+| Semantic similarity | 30% | Vector search — meaning, not keywords |
+| Q-value | 30% | Proven useful memories rank higher |
+| Keywords (BM25) | 10% | Exact matches when they matter |
+| Recency | 15% | Recent memories get a small boost |
+| Importance | 15% | Decisions outrank commands |
+
+The key: **Q-value is 30% of the ranking.** This means the system's search improves with every session. After 100 sessions, your retrieval is personalized by actual outcomes.
+
+---
+
+## Fully Local
+
+No SaaS. No memory content leaves your machine (the optional L4 explanations are the only external API call, and they can be disabled).
+
+| Component | Where it runs |
+|-----------|--------------|
+| **Qdrant** | Docker container on your machine |
+| **FastEmbed** | Local embeddings, no API calls |
+| **Q-cache** | JSON file on disk |
+| **LLM explanations (L4)** | Anthropic API (optional, can be disabled) |
+
+All data lives under `~/.openexp/`. You own everything.
+
+---
+
+## Built for Claude Code
+
+OpenExp integrates through native Claude Code hooks:
+
+| Hook | When | What happens |
+|------|------|-------------|
+| **Session start** | You open a session | Top memories injected into context |
+| **Each message** | You type something | Relevant memories retrieved |
+| **After each action** | AI writes/edits/runs | Observation recorded |
+| **Session end** | You close | Reward computed, Q-values updated |
+
+Zero manual work. Install, use Claude Code as usual, watch it get smarter.
+
+---
+
+## Quick Start
+
+```bash
+# Install
+pip install openexp-memory
+
+# Start Qdrant
+docker run -d --name openexp-qdrant -p 6333:6333 qdrant/qdrant
+
+# Register hooks with Claude Code
+openexp hooks install
+
+# Done. Use Claude Code as normal.
+```
+
+---
+
+## Open Source
+
+MIT License. GitHub: [anthroos/openexp](https://github.com/anthroos/openexp)
+
+Based on research: [The Yerkes-Dodson Curve for AI Agents](https://arxiv.org/abs/2603.07360)
diff --git a/docs/storage-system.md b/docs/storage-system.md
new file mode 100644
index 0000000..501cd83
--- /dev/null
+++ b/docs/storage-system.md
@@ -0,0 +1,455 @@
+# OpenExp Storage System — Complete Reference
+
+> **Purpose:** This document describes the full storage architecture so that Claude
+> doesn't have to re-read every source file each session. Read THIS instead of the code.
+>
+> **Last updated:** 2026-03-26 (after L4 audit, all gaps fixed, 237 tests pass)
+
+---
+
+## 1. The 5-Level Storage Pyramid
+
+Every memory gets a Q-value that rises when useful and falls when not.
+A number alone doesn't explain itself — each level adds understanding.
+
+| Level | What | Where | Size | Purpose |
+|-------|------|-------|------|---------|
+| **L0** | Raw observations | `~/.openexp/observations/*.jsonl` | ~50 KB/session | Everything that happened: tool calls, edits, commands |
+| **L1** | Q-value scalar | `q_cache.json` → `q_value` field | 1 float | How useful is this memory? (−0.5 … 1.0) |
+| **L2** | Reward contexts | `q_cache.json` → `reward_contexts[]` | Max 5 strings, 120 chars | Brief: `"Session +0.30: 2 commits, 1 PR [rwd_abc]"` |
+| **L3** | Cold storage | `reward_log.jsonl` | Full JSON per event | Complete reward record: observations, breakdowns, predictions |
+| **L4** | LLM explanation | `explanation` field in L3 record | Max 500 chars | Opus 4.6 writes WHY: "This note helped because…" |
+
+### Data Flow
+
+```
+Session observations (L0)
+ → compute_session_reward() → reward signal
+ → read q_before from QCache
+ → QValueUpdater.update_all_layers() → new Q-value (L1) + context (L2)
+ → read q_after from QCache
+ → generate_reward_explanation(q_before, q_after) → explanation (L4)
+ → log_reward_event() → cold record (L3) with explanation
+```
+
+### Linking Across Levels
+
+```
+L2 context string: "Session +0.30: 2 commits [rwd_abc12345]"
+ ↑
+L3 reward_log.jsonl: {"reward_id": "rwd_abc12345", ..., "explanation": "..."}
+ ↑
+L4 explanation: "This note helped because it contained the architecture decision..."
+```
+
+---
+
+## 2. Four Reward Paths
+
+Each path: reads q_before → updates Q-values → reads q_after → generates L4 explanation → logs L3 record.
+
+| # | Path | Trigger | File | `reward_type` |
+|---|------|---------|------|---------------|
+| 1 | **Session** | Session end (hook) | `openexp/ingest/reward.py` → `apply_session_reward()` | `"session"` |
+| 2 | **Prediction** | `log_outcome` MCP call | `openexp/reward_tracker.py` → `RewardTracker.log_outcome()` | `"prediction"` |
+| 3 | **Business** | `resolve_outcomes` MCP call | `openexp/outcome.py` → `resolve_outcomes()` | `"business"` |
+| 4 | **Calibration** | `calibrate_experience_q` MCP call | `openexp/mcp_server.py` | `"calibration"` |
+
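+That shared shape, as a minimal sketch — the dependencies are injected as callables, since names and signatures here are illustrative, not the real API:
+
+```python
+def apply_reward_path(memory_ids, reward, reward_type,
+                      get_q, update_memory, explain, log_event):
+    """Common skeleton of all four reward paths (sketch)."""
+    q_before = get_q(memory_ids[0]) if memory_ids else None
+    for mid in memory_ids:
+        update_memory(mid, reward)         # QValueUpdater.update_all_layers()
+    q_after = get_q(memory_ids[0]) if memory_ids else None
+    explanation = explain(reward_type, q_before, q_after)    # L4, may be None
+    log_event(reward_type, reward, memory_ids, explanation)  # L3 append
+```
+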
+### Path 1: Session Reward (`ingest/reward.py`)
+
+**Trigger:** `session-end.sh` hook → `ingest` CLI → `apply_session_reward()`
+
+**Logic:**
+1. `compute_session_reward(observations)` → heuristic score [−0.5, +0.5]
+ - Positive signals: commits (+0.3), PRs (+0.2), writes (+0.02 each), deploys (+0.1), tests (+0.1), decisions (+0.1)
+ - Negative: base (−0.1), few observations (−0.05), no output (−0.1)
+ - Experience-specific weights override defaults
+2. `_build_session_reward_context(obs, reward)` → L2 string: `"Session +0.30: 2 commits, 1 PR"`
+3. Read `q_before` from first memory's Q-cache entry
+4. `QValueUpdater.update_all_layers()` for each memory
+5. Read `q_after` from first memory's Q-cache entry
+6. `generate_reward_explanation(reward_type="session", q_before, q_after)` → L4
+7. `log_reward_event()` → L3
+
+**Also:** `reward_retrieved_memories()` — rewards memories recalled at session start (closed-loop). Delegates to `apply_session_reward()`.
+
+### Path 2: Prediction Reward (`reward_tracker.py`)
+
+**Trigger:** User calls `log_outcome` MCP tool with prediction_id + outcome + reward.
+
+**Logic:**
+1. Find pending prediction by ID
+2. Build reward context: `"Pred +0.80: 'prediction snippet' -> 'outcome snippet'"`
+3. Read `q_before` from first memory via `self.q_cache.get()`
+4. Update Q-values for all `memory_ids_used`
+5. Read `q_after`
+6. Generate L4 explanation with `reward_type="prediction"`
+7. Log L3 record
+
+**Data stored:** prediction text, outcome, confidence, strategic_value, cause_category.
+
+### Path 3: Business Reward (`outcome.py`)
+
+**Trigger:** User calls `resolve_outcomes` MCP tool → runs all registered `OutcomeResolver` subclasses.
+
+**Logic:**
+1. Each resolver scans external data (e.g., CRM CSV diffs) → emits `OutcomeEvent`s
+2. For each event: auto-resolve matching pending predictions
+3. Find memories tagged with `entity_id` via Qdrant scroll
+4. Read `q_before` from first memory via `q_updater.cache.get()`
+5. Apply reward to all tagged memories
+6. Read `q_after`
+7. Generate L4 explanation with `reward_type="business"`
+8. Log L3 record
+
+**Resolver:** `CRMCSVResolver` diffs `deals.csv` / `leads.csv` against snapshot, detects stage transitions.
+
+### Path 4: Calibration (`mcp_server.py`)
+
+**Trigger:** User calls `calibrate_experience_q` MCP tool with memory_id + new q_value.
+
+**Logic:**
+1. Read `old_q` from cache
+2. Set all Q-layers to `new_q` directly (no formula)
+3. Generate L4 explanation with `reward_type="calibration"`, `q_before=old_q, q_after=new_q`
+4. Log L3 record
+5. Append L2 context: `"Cal 0.80: <reason>"`
+
+---
+
+## 3. Q-Learning Engine (`core/q_value.py`)
+
+### Formula
+
+```
+Q_new = clamp(Q_old + alpha * reward, q_floor, q_ceiling)
+```
+
+- `alpha = 0.25` (learning rate)
+- `q_init = 0.0` (new memories start at zero)
+- `q_floor = -0.5`, `q_ceiling = 1.0`
+
+### Three Layers
+
+| Layer | Weight | Reward | What it measures |
+|-------|--------|--------|------------------|
+| `q_action` | 50% | full reward | Was retrieving this memory useful? |
+| `q_hypothesis` | 20% | reward × 0.8 | Is the hypothesis/insight valid? |
+| `q_fit` | 30% | full if positive, ×0.5 if negative | Does this memory fit the experience? |
+
+Combined: `Q = 0.5 * q_action + 0.2 * q_hypothesis + 0.3 * q_fit`
+
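+A minimal sketch of that update, directly following the constants and table above (the real `QValueUpdater.update_all_layers()` in `core/q_value.py` may differ in detail):
+
+```python
+def clamp(x, lo=-0.5, hi=1.0):
+    return max(lo, min(hi, x))
+
+def update_q_layers(q, reward, alpha=0.25):
+    """Apply one reward to the three layers, then recombine (illustrative)."""
+    q["q_action"] = clamp(q["q_action"] + alpha * reward)
+    q["q_hypothesis"] = clamp(q["q_hypothesis"] + alpha * reward * 0.8)
+    fit_reward = reward if reward >= 0 else reward * 0.5  # soften negative fit signal
+    q["q_fit"] = clamp(q["q_fit"] + alpha * fit_reward)
+    q["q_value"] = 0.5 * q["q_action"] + 0.2 * q["q_hypothesis"] + 0.3 * q["q_fit"]
+    return q
+```
+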
+### QCache
+
+- `OrderedDict` with LRU eviction (max 100K entries)
+- **Nested format:** `{memory_id: {experience_name: {q_value, q_action, q_hypothesis, q_fit, q_visits, reward_contexts[], q_updated_at, last_reward, ...}}}`
+- Auto-migrates from flat format on load
+- **Delta persistence:** each session writes only its changed entries to a per-session delta file under `~/.openexp/data/deltas/`. On startup, all deltas are merged into the main cache, newest wins (see the sketch below).
+- `save()` writes full cache; `save_delta()` writes only dirty entries.
+
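+A minimal sketch of the startup merge. The per-session file naming is an assumption; only the newest-wins merge order comes from the description above:
+
+```python
+import json
+from pathlib import Path
+
+def merge_deltas(cache: dict, deltas_dir: str) -> dict:
+    """Fold per-session delta files into the main cache, newest wins (sketch)."""
+    # Assumes file names sort chronologically, so later files are newer.
+    for delta_file in sorted(Path(deltas_dir).glob("*.json")):
+        try:
+            delta = json.loads(delta_file.read_text())
+        except (OSError, json.JSONDecodeError):
+            continue  # skip corrupt or partially written deltas
+        cache.update(delta)  # later (newer) files overwrite earlier entries
+    return cache
+```
+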
+### Reward Contexts (L2)
+
+- Max 5 per memory (FIFO eviction)
+- Max 120 chars each
+- Format: `"Session +0.30: 2 commits [rwd_abc12345]"` — the `[rwd_xxx]` suffix links to L3
+- Stored inside `q_data.reward_contexts[]`
+
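+The FIFO behavior above, sketched (the helper name is hypothetical):
+
+```python
+def append_reward_context(q_data: dict, ctx: str, max_items=5, max_len=120):
+    """Append one L2 context string, truncating and evicting the oldest (sketch)."""
+    contexts = q_data.setdefault("reward_contexts", [])
+    contexts.append(ctx[:max_len])          # enforce the 120-char cap
+    if len(contexts) > max_items:
+        del contexts[: len(contexts) - max_items]  # FIFO: drop oldest first
+```
+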
+---
+
+## 4. L4 Explanation Engine (`core/explanation.py`)
+
+### `generate_reward_explanation()`
+
+- **Model:** `claude-opus-4-6` (configurable via `OPENEXP_EXPLANATION_MODEL`)
+- **Enabled:** `OPENEXP_EXPLANATION_ENABLED=true` (default)
+- **max_tokens:** 200
+- **Safety cap:** 500 chars
+- **Graceful:** returns `None` on any error (disabled, no API key, API failure)
+- **Lazy client:** singleton `_anthropic_client` (same pattern as enrichment.py)
+
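+The graceful-degradation contract, sketched. The model call is injected as a callable here; the real function also builds the prompt from the reward context:
+
+```python
+import os
+
+EXPLANATION_MODEL = os.getenv("OPENEXP_EXPLANATION_MODEL", "claude-opus-4-6")
+EXPLANATION_ENABLED = os.getenv("OPENEXP_EXPLANATION_ENABLED", "true").lower() == "true"
+
+def explain_or_none(prompt: str, call_model) -> str | None:
+    """Return an explanation, or None on any failure; never block the reward path."""
+    if not EXPLANATION_ENABLED or not os.getenv("ANTHROPIC_API_KEY"):
+        return None
+    try:
+        text = call_model(EXPLANATION_MODEL, prompt, max_tokens=200)
+        return (text or "")[:500]  # 500-char safety cap
+    except Exception:
+        return None
+```
+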
+### Prompt Types
+
+| `reward_type` | Prompt focus | When used |
+|---------------|-------------|-----------|
+| `session` | Session observations + breakdown + memories used | Session end |
+| `prediction` | Prediction text + outcome + confidence | log_outcome |
+| `business` | Entity ID + event name + details | resolve_outcomes |
+| `calibration` | Old Q → New Q + reason | calibrate_experience_q |
+| `summary` | Aggregated events for a memory | explain_q regenerate=true |
+
+### Q-line in Prompts
+
+When both `q_before` and `q_after` are provided, the prompt includes:
+```
+Q-value: 0.30 → 0.58
+```
+When either is None, this line is omitted (graceful degradation).
+
+### `fetch_memory_contents()`
+
+Retrieves up to `limit` (default 5) memory texts from Qdrant by ID. Returns `{memory_id: content_text[:300]}`. Graceful on failure (returns `{}`).
+
+---
+
+## 5. Cold Storage (`core/reward_log.py`)
+
+### File
+
+`~/.openexp/data/reward_log.jsonl` — append-only JSONL, rotated at 100 MB.
+
+### Record Format
+
+```json
+{
+ "reward_id": "rwd_abc12345",
+ "timestamp": "2026-03-26T12:00:00+00:00",
+ "reward_type": "session",
+ "reward": 0.30,
+ "memory_ids": ["mem-1", "mem-2"],
+ "experience": "default",
+ "context": {
+ "observations": [...],
+ "observation_count": 15,
+ "reward_breakdown": {"commits": 2, "prs": 1, "writes": 5},
+ "session_id": "abc123"
+ },
+ "explanation": "Ця нотатка допомогла бо містила архітектурне рішення..."
+}
+```
+
+### Access Functions
+
+| Function | What | Used by |
+|----------|------|---------|
+| `generate_reward_id()` | `"rwd_<8hex>"` | All 4 paths |
+| `log_reward_event()` | Append record | All 4 paths |
+| `get_reward_detail(reward_id)` | Lookup by ID | `reward_detail` MCP tool |
+| `get_reward_history(memory_id)` | All events for a memory | `memory_reward_history`, `explain_q` MCP tools |
+| `compact_observation(obs)` | Strip to id/tool/summary/type/path/tags | Session path (L3 context) |
+
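+For example, `get_reward_history()` amounts to a scan of the append-only log. A sketch under that assumption (the real function takes only a memory_id, not a path):
+
+```python
+import json
+from pathlib import Path
+
+def reward_history(memory_id: str, log_path: str) -> list[dict]:
+    """All reward events that touched a memory, oldest first (sketch)."""
+    events = []
+    p = Path(log_path)
+    if not p.exists():
+        return events
+    with open(p) as f:
+        for line in f:
+            try:
+                rec = json.loads(line)
+            except json.JSONDecodeError:
+                continue  # tolerate partial or corrupt lines
+            if memory_id in rec.get("memory_ids", []):
+                events.append(rec)
+    return events
+```
+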
+---
+
+## 6. MCP Tools (16 total)
+
+### Memory CRUD
+| Tool | What |
+|------|------|
+| `search_memory` | FastEmbed + Qdrant + BM25 + Q-value reranking |
+| `add_memory` | Store new memory with embedding |
+
+### Prediction Loop
+| Tool | What |
+|------|------|
+| `log_prediction` | Log prediction → returns `pred_id` |
+| `log_outcome` | Resolve prediction → reward Q-values |
+
+### Context & Reflection
+| Tool | What |
+|------|------|
+| `get_agent_context` | memories + Q-scores + pending predictions |
+| `reflect` | Pattern finding on recent memories |
+| `memory_stats` | System statistics |
+
+### Outcome & Cache
+| Tool | What |
+|------|------|
+| `resolve_outcomes` | Run CRM resolvers → business rewards |
+| `reload_q_cache` | Reload from disk |
+
+### Experience Introspection
+| Tool | What |
+|------|------|
+| `experience_info` | Current experience config |
+| `experience_top_memories` | Top/bottom N by Q-value |
+| `experience_insights` | Reward distribution, learning velocity |
+
+### Q-Value Inspection
+| Tool | What |
+|------|------|
+| `calibrate_experience_q` | Manually set Q-value + L4 explanation |
+| `memory_reward_history` | Q + L2 contexts + L3 records |
+| `reward_detail` | Full L3 record by reward_id |
+| `explain_q` | Aggregated L4 explanations + optional LLM regeneration |
+
+---
+
+## 7. Experience System (`core/experience.py`)
+
+Same memory can have different Q-values per experience (e.g., "default", "sales", "coding").
+
+- Configs in `~/.openexp/experiences/<name>.yaml` or bundled defaults
+- Each experience defines: reward weights, resolver configs, type boosts
+- Active experience set via `OPENEXP_EXPERIENCE` env var (default: `"default"`)
+- Q-cache stores: `{memory_id: {experience_name: {q_data...}, ...}}`
+
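+Reading the nested cache for the active experience, as a one-line sketch:
+
+```python
+def q_for(cache: dict, memory_id: str, experience: str = "default") -> dict:
+    """q_data for one memory under one experience; empty dict if absent (sketch)."""
+    return cache.get(memory_id, {}).get(experience, {})
+```
+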
+---
+
+## 8. Search & Scoring
+
+### Search Pipeline (`core/direct_search.py` + `hybrid_search.py`)
+
+1. **FastEmbed** (BAAI/bge-small-en-v1.5, 384-dim, local) embeds query
+2. **Qdrant** vector search with lifecycle + metadata filters
+3. **BM25** pure-Python scoring on payload texts
+4. **Hybrid merge:** vector 30% + BM25 10% + recency 15% + importance 15% + Q-value 30%
+
+### Scoring Weights (`core/scoring.py`)
+
+| Component | Weight | Source |
+|-----------|--------|--------|
+| Semantic similarity | 30% | FastEmbed cosine via Qdrant |
+| Q-value | 30% | Q-cache |
+| Recency | 15% | `created_at` exponential decay |
+| Importance | 15% | Memory type + tags |
+| BM25 keyword | 10% | Hybrid search |
+
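+The merge itself is a plain weighted sum. A sketch, assuming each component is already normalized to [0, 1]; the mapping of Q from [-0.5, 1.0] into that range is an assumption:
+
+```python
+def composite_score(semantic, q_value, bm25, recency, importance):
+    """Five-factor merge matching the weights table above (sketch)."""
+    q_norm = (q_value + 0.5) / 1.5  # map Q from [-0.5, 1.0] into [0, 1] (assumed)
+    return (0.30 * semantic + 0.30 * q_norm + 0.10 * bm25
+            + 0.15 * recency + 0.15 * importance)
+```
+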
+---
+
+## 9. Ingest Pipeline
+
+### Flow
+
+```
+~/.openexp/observations/*.jsonl (written by post-tool-use hook)
+ ↓
+ filters.py (drops ~60-70% trivial obs)
+ ↓
+ observation.py (batch embed via FastEmbed → upsert to Qdrant)
+ ↓
+~/.openexp/sessions/*.md (written by session-end hook)
+ ↓
+ session_summary.py (parse markdown → higher-importance memories)
+ ↓
+ reward.py (compute session reward → update Q-values)
+ ↓
+ watermark.py (mark processed obs IDs for idempotency)
+```
+
+### Filters (`ingest/filters.py`)
+
+Drops: read-only commands (cat, grep, ls), short summaries (<15 chars), Read/Glob/Grep tool calls.
+Keeps: Write, Edit, Bash with side effects, decisions, valuable tags.
+
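+A sketch of the drop side of that policy — the thresholds come from above, while the `"Ran: "` prefix and the exact command list are assumptions:
+
+```python
+READONLY_PREFIXES = ("cat ", "grep ", "ls ")
+DROP_TOOLS = {"Read", "Glob", "Grep"}
+
+def should_drop(obs: dict) -> bool:
+    """True if an observation is too trivial to ingest (illustrative)."""
+    if obs.get("tool") in DROP_TOOLS:
+        return True
+    summary = obs.get("summary", "").strip()
+    if len(summary) < 15:
+        return True
+    cmd = summary.removeprefix("Ran: ").lstrip()
+    return cmd.startswith(READONLY_PREFIXES)
+```
+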
+---
+
+## 10. Hooks (Claude Code Integration)
+
+| Hook | File | When | What |
+|------|------|------|------|
+| **SessionStart** | `session-start.sh` | Session begins | Search Qdrant → inject top-5 memories → log retrieval IDs |
+| **UserPromptSubmit** | `user-prompt-recall.sh` | Each message | Context recall (skip trivial) → inject |
+| **PostToolUse** | `post-tool-use.sh` | After Write/Edit/Bash | Write observation to JSONL (skip reads) |
+| **SessionEnd** | `session-end.sh` | Session ends | Generate summary → async ingest → compute reward |
+
+---
+
+## 11. File Map
+
+### Config
+
+| File | Purpose |
+|------|---------|
+| `core/config.py` | All env-var-based settings (paths, models, keys, ports) |
+
+### Core Engine
+
+| File | Purpose |
+|------|---------|
+| `core/q_value.py` | QCache (LRU + delta), QValueUpdater (3-layer), QScorer, reward contexts |
+| `core/direct_search.py` | FastEmbed embedding + Qdrant vector search |
+| `core/hybrid_search.py` | Pure Python BM25 implementation |
+| `core/scoring.py` | Composite scoring (semantic + recency + importance + Q) |
+| `core/lifecycle.py` | 8-state memory lifecycle with transition validation |
+| `core/enrichment.py` | LLM metadata extraction (Haiku) |
+| `core/explanation.py` | L4 LLM reward explanations (Opus) |
+| `core/reward_log.py` | L3 cold storage JSONL |
+| `core/experience.py` | Per-experience Q-values + YAML configs |
+| `core/compaction.py` | Cluster similar memories, merge, deduplicate |
+| `core/v7_extensions.py` | Lifecycle filtering + hybrid scoring helpers |
+
+### Ingest
+
+| File | Purpose |
+|------|---------|
+| `ingest/filters.py` | Drop trivial observations |
+| `ingest/observation.py` | Batch embed → Qdrant upsert |
+| `ingest/session_summary.py` | Parse session markdown → memories |
+| `ingest/reward.py` | Session reward computation + Q-update + L3/L4 |
+| `ingest/retrieval_log.py` | Track recalled memory IDs |
+| `ingest/watermark.py` | Idempotent ingestion tracking |
+
+### Reward Paths
+
+| File | Purpose |
+|------|---------|
+| `ingest/reward.py` | Path 1: Session reward |
+| `reward_tracker.py` | Path 2: Prediction → outcome |
+| `outcome.py` | Path 3: Business events (+ OutcomeResolver ABC) |
+| `mcp_server.py` | Path 4: Calibration (+ all 16 MCP tools) |
+| `resolvers/crm_csv.py` | CRM CSV diff resolver |
+
+### Other
+
+| File | Purpose |
+|------|---------|
+| `mcp_server.py` | STDIO MCP server (init, tools, request handler) |
+| `cli.py` | CLI: search, ingest, stats, viz |
+| `viz.py` | Export data for visualization dashboard |
+
+---
+
+## 12. Data Files
+
+| File | Path | Format |
+|------|------|--------|
+| Q-cache | `~/.openexp/data/q_cache.json` | Nested JSON: `{mem_id: {exp: {q_data}}}` |
+| Q-cache deltas | `~/.openexp/data/deltas/` | One JSON file per session; same nested format, dirty entries only |
+| Reward log (L3) | `~/.openexp/data/reward_log.jsonl` | JSONL, rotated at 100 MB |
+| Predictions | `~/.openexp/data/predictions.jsonl` | JSONL: pending/resolved predictions |
+| Outcomes | `~/.openexp/data/outcomes.jsonl` | JSONL: prediction outcomes |
+| Retrieval log | `~/.openexp/data/session_retrievals.jsonl` | Which memories recalled when |
+| CRM snapshot | `~/.openexp/data/crm_snapshot.json` | Last CRM state for diffing |
+| Ingest watermark | `~/.openexp/data/ingest_watermark.json` | Processed observation IDs |
+| Observations (L0) | `~/.openexp/observations/observations-YYYY-MM-DD.jsonl` | Raw tool-use observations |
+| Session summaries | `~/.openexp/sessions/*.md` | Markdown session summaries |
+
+---
+
+## 13. Environment Variables
+
+| Variable | Default | What |
+|----------|---------|------|
+| `OPENEXP_DATA_DIR` | `~/.openexp/data` | Main data directory |
+| `OPENEXP_OBSERVATIONS_DIR` | `~/.openexp/observations` | Raw observations |
+| `OPENEXP_SESSIONS_DIR` | `~/.openexp/sessions` | Session summaries |
+| `OPENEXP_COLLECTION` | `openexp_memories` | Qdrant collection name |
+| `OPENEXP_EMBEDDING_MODEL` | `BAAI/bge-small-en-v1.5` | FastEmbed model |
+| `OPENEXP_EMBEDDING_DIM` | `384` | Embedding dimensions |
+| `OPENEXP_ENRICHMENT_MODEL` | `claude-haiku-4-5-20251001` | Enrichment LLM |
+| `OPENEXP_EXPLANATION_MODEL` | `claude-opus-4-6` | L4 explanation LLM |
+| `OPENEXP_EXPLANATION_ENABLED` | `true` | Enable/disable L4 |
+| `OPENEXP_EXPERIENCE` | `default` | Active experience name |
+| `OPENEXP_EXPERIENCES_DIR` | `~/.openexp/experiences` | Experience YAML configs |
+| `OPENEXP_OUTCOME_RESOLVERS` | `""` | Resolver classes (module:Class) |
+| `OPENEXP_CRM_DIR` | `""` | CRM directory for CSV resolver |
+| `OPENEXP_INGEST_BATCH_SIZE` | `50` | Batch size for embedding |
+| `QDRANT_HOST` | `localhost` | Qdrant host |
+| `QDRANT_PORT` | `6333` | Qdrant port |
+| `QDRANT_API_KEY` | `""` | Qdrant auth (optional) |
+| `ANTHROPIC_API_KEY` | `""` | For enrichment + explanations |
+
+---
+
+## 14. Test Coverage
+
+237 tests across 11 test files. Key test files for the storage system:
+
+| File | Tests | What |
+|------|-------|------|
+| `test_explanation.py` | 21 | L4 prompts, generation, fetch, L3 field, explain_q, integration |
+| `test_q_value.py` | 17 | QCache CRUD, LRU, delta, updater, scorer, reward contexts |
+| `test_reward_log.py` | 11 | Reward ID, log/get, history, compact |
+| `test_reward_context.py` | 11 | L2 context builders for all 3 paths |
+| `test_outcome.py` | 15 | OutcomeEvent, matching, CRM resolver, resolve_outcomes |
+| `test_session_end.py` | 7 | Session reward, retrieval log, closed-loop |
+| `test_experience.py` | 16 | Experience loading, per-experience Q, migration |
diff --git a/openexp/core/config.py b/openexp/core/config.py
index 54f48bf..af9e640 100644
--- a/openexp/core/config.py
+++ b/openexp/core/config.py
@@ -44,6 +44,10 @@
# Enrichment model (optional — requires ANTHROPIC_API_KEY)
ENRICHMENT_MODEL = os.getenv("OPENEXP_ENRICHMENT_MODEL", "claude-haiku-4-5-20251001")
+# L4: LLM-generated reward explanations (default: Opus for deep understanding)
+EXPLANATION_MODEL = os.getenv("OPENEXP_EXPLANATION_MODEL", "claude-opus-4-6")
+EXPLANATION_ENABLED = os.getenv("OPENEXP_EXPLANATION_ENABLED", "true").lower() == "true"
+
# Outcome resolvers (format: "module:ClassName,module2:ClassName2")
OUTCOME_RESOLVERS = os.getenv("OPENEXP_OUTCOME_RESOLVERS", "").strip()
From 9cc5661186208e7f93621013e971d17fd73dbd8b Mon Sep 17 00:00:00 2001
From: Ivan Pasichnyk
Date: Sun, 29 Mar 2026 21:07:52 -0700
Subject: [PATCH 24/59] feat: add CI workflow + update README for full feature
set
- GitHub Actions: test on Python 3.11/3.12/3.13 with Qdrant service
- README: 16 MCP tools (was 8), updated architecture, CLI commands,
new docs links, CI badge
- Contributing: updated focus areas
Co-Authored-By: Claude Opus 4.6
---
.github/workflows/tests.yml | 37 +++++++++++++++++++++++++++
README.md | 50 ++++++++++++++++++++++++++++++++-----
2 files changed, 81 insertions(+), 6 deletions(-)
create mode 100644 .github/workflows/tests.yml
diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
new file mode 100644
index 0000000..86d8b0c
--- /dev/null
+++ b/.github/workflows/tests.yml
@@ -0,0 +1,37 @@
+name: Tests
+
+on:
+ push:
+ branches: [main]
+ pull_request:
+ branches: [main]
+
+jobs:
+ test:
+ runs-on: ubuntu-latest
+ strategy:
+ matrix:
+ python-version: ["3.11", "3.12", "3.13"]
+
+ services:
+ qdrant:
+ image: qdrant/qdrant:latest
+ ports:
+ - 6333:6333
+
+ steps:
+ - uses: actions/checkout@v4
+
+ - name: Set up Python ${{ matrix.python-version }}
+ uses: actions/setup-python@v5
+ with:
+ python-version: ${{ matrix.python-version }}
+
+ - name: Install dependencies
+ run: |
+ python -m pip install --upgrade pip
+ pip install -r requirements.txt
+ pip install pytest
+
+ - name: Run tests
+ run: pytest tests/ -v --tb=short
diff --git a/README.md b/README.md
index 1f9a9c0..727f159 100644
--- a/README.md
+++ b/README.md
@@ -7,6 +7,7 @@
+
@@ -127,7 +128,7 @@ Three hooks integrate with Claude Code automatically:
| **PostToolUse** | After Write/Edit/Bash | Captures what Claude does as observations (JSONL) |
| **SessionEnd** | Session closes | Generates summary, triggers ingest + reward (async) |
-The MCP server provides 8 tools for explicit memory operations (search, add, predict, reflect).
+The MCP server provides 16 tools for memory operations, introspection, and calibration.
### The Learning Loop
@@ -185,6 +186,8 @@ With 10% epsilon-greedy exploration — occasionally surfaces low-Q memories to
## MCP Tools
+**Core — memory operations:**
+
| Tool | Description |
|------|-------------|
| `search_memory` | Hybrid search: BM25 + vector + Q-value reranking |
@@ -197,6 +200,18 @@ With 10% epsilon-greedy exploration — occasionally surfaces low-Q memories to
| `memory_stats` | Q-cache size, prediction accuracy stats |
| `reload_q_cache` | Hot-reload Q-values from disk |
+**Introspection — understand why memories rank the way they do:**
+
+| Tool | Description |
+|------|-------------|
+| `experience_info` | Active experience config (weights, resolvers, boosts) |
+| `experience_top_memories` | Top or bottom N memories by Q-value |
+| `experience_insights` | Reward distribution, learning velocity, valuable memory types |
+| `calibrate_experience_q` | Manually set Q-value for a memory with reason |
+| `memory_reward_history` | Full reward trail: Q-value changes, contexts (L2), cold storage (L3) |
+| `reward_detail` | Complete L3 cold storage record for a reward event |
+| `explain_q` | Human-readable LLM explanation of why a memory has its Q-value (L4) |
+
## CLI
```bash
@@ -214,6 +229,18 @@ openexp resolve
# Show Q-cache statistics
openexp stats
+
+# Memory compaction (merge similar memories)
+openexp compact --dry-run
+
+# Manage experiences
+openexp experience list
+openexp experience show sales
+openexp experience create # interactive wizard
+
+# Visualization
+openexp viz --replay latest # session replay
+openexp viz --demo # demo dashboard
```
## Configuration
@@ -249,7 +276,11 @@ openexp/
│ ├── hybrid_search.py # BM25 keyword + vector + Q-value hybrid scoring
│ ├── scoring.py # Composite relevance: similarity × recency × importance
│ ├── lifecycle.py # 8-state memory lifecycle (active→confirmed→archived→...)
+│ ├── experience.py # Per-domain Q-value contexts (default, sales, dealflow)
│ ├── enrichment.py # Auto-metadata extraction (LLM or defaults)
+│ ├── explanation.py # L4: LLM-generated reward explanations
+│ ├── reward_log.py # L3: cold storage of reward events
+│ ├── compaction.py # Memory merging/clustering
│ ├── v7_extensions.py # Lifecycle filter + hybrid scoring integration
│ └── config.py # Environment-based configuration
│
@@ -264,6 +295,11 @@ openexp/
├── resolvers/ # Outcome resolvers (pluggable)
│ └── crm_csv.py # CRM CSV stage transition → reward events
│
+├── data/experiences/ # Shipped experience configs
+│ ├── default.yaml # Software engineering
+│ ├── sales.yaml # Sales & outreach
+│ └── dealflow.yaml # Deal pipeline
+│
├── outcome.py # Outcome resolution framework
│
├── hooks/ # Claude Code integration
@@ -272,9 +308,10 @@ openexp/
│ ├── post-tool-use.sh # Capture observations from tool calls
│ └── session-end.sh # Summary + ingest + reward (closes the loop)
│
-├── mcp_server.py # MCP STDIO server (JSON-RPC 2.0)
+├── mcp_server.py # MCP STDIO server (16 tools, JSON-RPC 2.0)
├── reward_tracker.py # Prediction → outcome → Q-value updates
-└── cli.py # CLI: search, ingest, stats
+├── viz.py # Visualization + session replay
+└── cli.py # CLI: search, ingest, stats, viz, compact, experience
```
### Memory Lifecycle
@@ -370,6 +407,7 @@ export OPENEXP_EXPERIENCE=dealflow
Detailed docs are available in the [`docs/`](docs/) directory:
- [How It Works](docs/how-it-works.md) — full explanation of the learning loop
+- [Storage System](docs/storage-system.md) — 5-level pyramid (L0–L4), all 4 reward paths
- [Experiences](docs/experiences.md) — domain-specific reward profiles (create your own)
- [Architecture](docs/architecture.md) — system design and data flow
- [Configuration](docs/configuration.md) — all environment variables and options
@@ -380,11 +418,11 @@ This project is in early stages. See [CONTRIBUTING.md](CONTRIBUTING.md) for setu
Key areas where help is welcome:
-- **Reward signals** — beyond commits/PRs, what indicates a productive session?
-- **Compaction** — merging duplicate or outdated memories automatically
+- **New experiences** — domain-specific reward profiles (DevOps, writing, research, etc.)
+- **Outcome resolvers** — new integrations beyond CRM (Jira, Linear, GitHub Issues)
- **Multi-project learning** — sharing relevant context across projects
- **Benchmarks** — measuring retrieval quality improvement over time
-- **More lifecycle transitions** — automated contradiction detection
+- **Automated lifecycle transitions** — contradiction detection, staleness heuristics
## Research
From 3d219793c46df29e756aee5eceaecada0265c4c1 Mon Sep 17 00:00:00 2001
From: Ivan Pasichnyk
Date: Sun, 29 Mar 2026 21:17:46 -0700
Subject: [PATCH 25/59] fix: security hardening — command injection, path traversal, secret filtering
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
- hooks: pass $CWD via env var instead of string interpolation to prevent
command injection through crafted directory names (session-start.sh,
session-end.sh)
- experience.py: validate experience names with ^[a-zA-Z0-9_-]+$ regex
to prevent path traversal via malicious .openexp.yaml
- filters.py: add secret pattern detection (API keys, AWS keys, private
keys) to prevent accidental ingestion of credentials into Qdrant
- .env.example: stronger recommendation to set QDRANT_API_KEY
Co-Authored-By: Claude Opus 4.6
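Both validation layers are small enough to sketch inline (patterns abridged from the diff below). The injection vector being closed: `$CWD` used to be spliced into a single-quoted Python string inside the hook, so a directory name containing `')` could break out of the literal and execute arbitrary code.

```python
import re

# Sketch of the two checks this patch adds (abridged from the diff below).
_VALID_NAME_RE = re.compile(r"^[a-zA-Z0-9_-]+$")

def _validate_experience_name(name: str) -> bool:
    return bool(_VALID_NAME_RE.match(name)) and len(name) <= 64

assert _validate_experience_name("sales")
assert not _validate_experience_name("../../etc/passwd")  # traversal rejected

# Subset of the secret patterns; the real list also covers GitHub tokens etc.
_SECRET_RE = re.compile(r"sk-[a-zA-Z0-9]{20,}|AKIA[0-9A-Z]{16}")
assert _SECRET_RE.search("OPENAI_KEY=sk-abcdefghijklmnopqrstuvwxyz")
```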
---
.env.example | 3 ++-
openexp/core/experience.py | 13 +++++++++++++
openexp/hooks/session-end.sh | 6 +++++-
openexp/hooks/session-start.sh | 6 +++++-
openexp/ingest/filters.py | 16 ++++++++++++++++
5 files changed, 41 insertions(+), 3 deletions(-)
diff --git a/.env.example b/.env.example
index 7e5598e..bce3ad4 100644
--- a/.env.example
+++ b/.env.example
@@ -5,8 +5,9 @@
QDRANT_HOST=localhost
QDRANT_PORT=6333
OPENEXP_COLLECTION=openexp_memories
-# Qdrant API key (optional — set to enable authentication)
+# Qdrant API key (RECOMMENDED — without this, any local process can read your memories)
# If set, setup.sh will also pass it to the Docker container as QDRANT__SERVICE__API_KEY
+# Generate one with: python3 -c "import secrets; print(secrets.token_urlsafe(32))"
# QDRANT_API_KEY=
# Data directory (default: ~/.openexp/data)
diff --git a/openexp/core/experience.py b/openexp/core/experience.py
index 0e56a23..1116e15 100644
--- a/openexp/core/experience.py
+++ b/openexp/core/experience.py
@@ -11,6 +11,7 @@
"""
import logging
import os
+import re
from dataclasses import dataclass, field
from pathlib import Path
from typing import Dict, List, Optional
@@ -76,6 +77,14 @@ def _parse_yaml(path: Path) -> Experience:
)
+_VALID_NAME_RE = re.compile(r"^[a-zA-Z0-9_-]+$")
+
+
+def _validate_experience_name(name: str) -> bool:
+ """Validate experience name to prevent path traversal."""
+ return bool(_VALID_NAME_RE.match(name)) and len(name) <= 64
+
+
def load_experience(name: str) -> Experience:
"""Load an experience by name.
@@ -84,6 +93,10 @@ def load_experience(name: str) -> Experience:
2. openexp/data/experiences/{name}.yaml
3. DEFAULT_EXPERIENCE (if name == "default")
"""
+ if not _validate_experience_name(name):
+ logger.warning("Invalid experience name '%s', falling back to default", name)
+ return DEFAULT_EXPERIENCE
+
if name == "default":
# Try YAML files first, fall back to constant
for directory in (_user_experiences_dir(), _BUNDLED_DIR):
diff --git a/openexp/hooks/session-end.sh b/openexp/hooks/session-end.sh
index 5d8f286..849a978 100755
--- a/openexp/hooks/session-end.sh
+++ b/openexp/hooks/session-end.sh
@@ -138,7 +138,11 @@ fi
# Resolve experience: project .openexp.yaml → env var → default
EXPERIENCE="${OPENEXP_EXPERIENCE:-default}"
if [ -n "$CWD" ] && [ -f "$CWD/.openexp.yaml" ]; then
- PROJECT_EXP=$(python3 -c "import yaml; d=yaml.safe_load(open('$CWD/.openexp.yaml')); print(d.get('experience',''))" 2>/dev/null)
+ PROJECT_EXP=$(OPENEXP_CWD="$CWD" python3 -c "
+import yaml, os
+d=yaml.safe_load(open(os.path.join(os.environ['OPENEXP_CWD'], '.openexp.yaml')))
+print(d.get('experience',''))
+" 2>/dev/null)
[ -n "$PROJECT_EXP" ] && EXPERIENCE="$PROJECT_EXP"
fi
export OPENEXP_EXPERIENCE="$EXPERIENCE"
diff --git a/openexp/hooks/session-start.sh b/openexp/hooks/session-start.sh
index 5a8d465..7cf463e 100755
--- a/openexp/hooks/session-start.sh
+++ b/openexp/hooks/session-start.sh
@@ -50,7 +50,11 @@ export OPENEXP_TMPDIR="$TMPDIR_HOOK"
# Resolve experience: project .openexp.yaml → env var → default
EXPERIENCE="${OPENEXP_EXPERIENCE:-default}"
if [ -f "$CWD/.openexp.yaml" ]; then
- PROJECT_EXP=$(python3 -c "import yaml; d=yaml.safe_load(open('$CWD/.openexp.yaml')); print(d.get('experience',''))" 2>/dev/null)
+ PROJECT_EXP=$(OPENEXP_CWD="$CWD" python3 -c "
+import yaml, os
+d=yaml.safe_load(open(os.path.join(os.environ['OPENEXP_CWD'], '.openexp.yaml')))
+print(d.get('experience',''))
+" 2>/dev/null)
[ -n "$PROJECT_EXP" ] && EXPERIENCE="$PROJECT_EXP"
fi
"$PYTHON" -c "
diff --git a/openexp/ingest/filters.py b/openexp/ingest/filters.py
index e83edd1..315cdae 100644
--- a/openexp/ingest/filters.py
+++ b/openexp/ingest/filters.py
@@ -5,6 +5,17 @@
import re
from typing import Dict
+# Patterns that indicate secrets — never ingest these observations
+_SECRET_PATTERNS = [
+ r"sk-ant-api\w+", # Anthropic API keys
+ r"sk-[a-zA-Z0-9]{20,}", # OpenAI-style keys
+ r"ghp_[a-zA-Z0-9]{36}", # GitHub personal access tokens
+ r"gho_[a-zA-Z0-9]{36}", # GitHub OAuth tokens
+ r"AKIA[0-9A-Z]{16}", # AWS access key IDs
+ r"-----BEGIN.*PRIVATE KEY", # Private keys
+]
+_SECRET_RE = re.compile("|".join(_SECRET_PATTERNS))
+
_READONLY_PATTERNS = [
r"^(git\s+(status|log|diff|show|branch|remote|stash\s+list))",
r"^(find|grep|rg|ls|cat|head|tail|wc|du|tree|stat)\b",
@@ -35,6 +46,11 @@ def should_keep(obs: Dict) -> bool:
tags = set(obs.get("tags", []))
obs_type = obs.get("type", "")
+ # Never ingest observations containing secrets
+ full_text = summary + " " + str(obs.get("context", ""))
+ if _SECRET_RE.search(full_text):
+ return False
+
if tags & _VALUABLE_TAGS:
return True
if obs_type in ("decision", "retrospective"):
From 47abcb927b41d55e823167ec5d597656787ce335 Mon Sep 17 00:00:00 2001
From: Ivan Pasichnyk
Date: Sun, 29 Mar 2026 21:37:40 -0700
Subject: [PATCH 26/59] fix: reward only recalled memories, count only
per-session observations
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Two critical bugs in the Q-learning reward loop:
1. Session reward was computed from ALL observations across ALL sessions
(showing "67 commits, 154 PRs" = lifetime cumulative stats). Now filters
to only observations matching the current session_id.
2. Reward was applied to ALL newly ingested memories (2,721 at once)
instead of only the 5-10 memories recalled at session start. Now uses
reward_retrieved_memories() exclusively — the correct closed-loop path.
These bugs made Q-values meaningless (99.8% of memories sat at an identical Q=0.12).
Q-cache has been reset to allow clean re-learning.
Co-Authored-By: Claude Opus 4.6
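Why the old path flattened the distribution: if every ingested memory receives the same additive update each session, all Q-values move in lockstep and the ranking carries no information. A toy illustration (session rewards invented for the example):

```python
# Toy illustration: uniform rewards give every memory an identical trajectory.
ALPHA = 0.25
q = {f"mem-{i}": 0.0 for i in range(2721)}   # all newly ingested memories

for session_reward in (0.2, 0.16, 0.12):     # same reward applied to ALL of them
    for mem_id in q:
        q[mem_id] += ALPHA * session_reward

assert len(set(q.values())) == 1             # every memory at the same Q
print(round(next(iter(q.values())), 2))      # 0.12 — ranking is meaningless
```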
---
openexp/ingest/__init__.py | 52 ++++++++++++++++++++++----------------
1 file changed, 30 insertions(+), 22 deletions(-)
diff --git a/openexp/ingest/__init__.py b/openexp/ingest/__init__.py
index 655fa82..7d5898d 100644
--- a/openexp/ingest/__init__.py
+++ b/openexp/ingest/__init__.py
@@ -52,7 +52,7 @@ def ingest_session(
"""Full ingest pipeline: observations + sessions + reward."""
from .observation import ingest_observations
from .session_summary import ingest_sessions
- from .reward import compute_session_reward, apply_session_reward, reward_retrieved_memories, _build_session_reward_context
+ from .reward import compute_session_reward, reward_retrieved_memories, _build_session_reward_context
result = {}
@@ -68,34 +68,42 @@ def ingest_session(
if dry_run:
return result
+ # Clean up internal fields from observation result
obs_data = result.get("observations", {})
- point_ids = obs_data.pop("_point_ids", [])
+ obs_data.pop("_point_ids", [])
raw_obs = obs_data.pop("_raw_observations", [])
- if point_ids and raw_obs:
- reward = compute_session_reward(raw_obs)
+ # --- Session Reward: reward RECALLED memories, not ingested ones ---
+ # Filter observations to THIS session only (fixes cumulative counting bug)
+ if session_id and raw_obs:
+ session_obs = [o for o in raw_obs if session_id in o.get("session_id", "")]
+ else:
+ session_obs = raw_obs
+
+ if session_id and session_obs:
+ reward = compute_session_reward(session_obs)
if reward != 0.0:
- reward_ctx = _build_session_reward_context(raw_obs, reward)
- updated = apply_session_reward(
- point_ids, reward, reward_context=reward_ctx,
- observations=raw_obs, session_id=session_id,
+ reward_ctx = _build_session_reward_context(session_obs, reward)
+ # Reward only memories that were RECALLED at session start (closed loop)
+ retrieved_updated = reward_retrieved_memories(
+ session_id, reward, reward_context=reward_ctx,
+ )
+ result["reward"] = {
+ "applied": True,
+ "value": reward,
+ "retrieved_memories_rewarded": retrieved_updated,
+ "session_observations": len(session_obs),
+ }
+ logger.info(
+ "Session reward=%.2f applied to %d retrieved memories (from %d session obs)",
+ reward, retrieved_updated, len(session_obs),
)
- result["reward"] = {"applied": True, "value": reward, "updated": updated}
- logger.info("Session reward=%.2f applied to %d memories", reward, updated)
else:
- result["reward"] = {"applied": False, "value": 0.0, "reason": "neutral session"}
- reward_ctx = None
+ result["reward"] = {"applied": False, "value": 0.0, "reason": "neutral session", "retrieved_memories_rewarded": 0}
+ elif not session_id:
+ result["reward"] = {"applied": False, "reason": "no session_id provided", "retrieved_memories_rewarded": 0}
else:
- result["reward"] = {"applied": False, "reason": "no new observations"}
- reward_ctx = None
-
- if session_id:
- reward_val = result.get("reward", {}).get("value", 0.0)
- if reward_val and reward_val != 0.0:
- retrieved_updated = reward_retrieved_memories(session_id, reward_val, reward_context=reward_ctx)
- result["reward"]["retrieved_memories_rewarded"] = retrieved_updated
- else:
- result["reward"]["retrieved_memories_rewarded"] = 0
+ result["reward"] = {"applied": False, "reason": "no observations for this session", "retrieved_memories_rewarded": 0}
# Run outcome resolvers (CRM stage transitions, etc.)
try:
From 96cd581a38758a7476a35af03bc8e51d022a0751 Mon Sep 17 00:00:00 2001
From: Ivan Pasichnyk
Date: Sun, 29 Mar 2026 21:48:49 -0700
Subject: [PATCH 27/59] feat: auto-tag observations with client_id from CRM
companies
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Observations are now automatically matched against CRM company names
during ingestion. This enables the CRM resolver to reward memories
when deals progress (e.g., invoiced→paid), closing the business
outcome feedback loop.
Co-Authored-By: Claude Opus 4.6
---
openexp/ingest/observation.py | 58 ++++++++++++++++++++++++++++++++++-
1 file changed, 57 insertions(+), 1 deletion(-)
diff --git a/openexp/ingest/observation.py b/openexp/ingest/observation.py
index 0e5756b..ead3822 100644
--- a/openexp/ingest/observation.py
+++ b/openexp/ingest/observation.py
@@ -73,6 +73,7 @@ def _obs_to_payload(obs: Dict) -> Dict:
obs_type = obs.get("type", "feature")
tool = obs.get("tool", "")
summary = obs.get("summary", "")
+ client_id = obs.get("client_id") or _detect_client_id(obs)
return {
"memory": summary,
@@ -96,13 +97,68 @@ def _obs_to_payload(obs: Dict) -> Dict:
"tool": tool,
"tags": obs.get("tags", []),
"file_path": obs.get("context", {}).get("file_path", ""),
- **({"client_id": obs["client_id"]} if obs.get("client_id") else {}),
+ **({"client_id": client_id} if client_id else {}),
},
}
MAX_FILE_SIZE = 50 * 1024 * 1024 # 50 MB
+# --- Client auto-tagging from CRM ---
+_CLIENT_LOOKUP: Optional[Dict] = None
+
+
+def _load_client_lookup() -> Dict[str, str]:
+ """Load company name → company_id lookup from CRM CSV.
+
+ Returns {lowercase_name: company_id} for auto-tagging observations.
+ Cached on first call. Returns empty dict if CRM not configured.
+ """
+ global _CLIENT_LOOKUP
+ if _CLIENT_LOOKUP is not None:
+ return _CLIENT_LOOKUP
+
+ from ..core.config import CRM_DIR
+ _CLIENT_LOOKUP = {}
+ if not CRM_DIR or not CRM_DIR.exists():
+ return _CLIENT_LOOKUP
+
+ companies_path = CRM_DIR / "contacts" / "companies.csv"
+ if not companies_path.exists():
+ return _CLIENT_LOOKUP
+
+ import csv
+ try:
+ with open(companies_path, encoding="utf-8") as f:
+ for row in csv.DictReader(f):
+ cid = row.get("company_id", "").strip()
+ name = row.get("name", "").strip()
+ if cid and name and len(name) >= 3:
+ _CLIENT_LOOKUP[name.lower()] = cid
+ except Exception as e:
+ logger.warning("Failed to load CRM companies for auto-tagging: %s", e)
+
+ logger.info("Loaded %d companies for client auto-tagging", len(_CLIENT_LOOKUP))
+ return _CLIENT_LOOKUP
+
+
+def _detect_client_id(obs: Dict) -> Optional[str]:
+ """Detect client_id from observation content by matching CRM company names."""
+ lookup = _load_client_lookup()
+ if not lookup:
+ return None
+
+ # Build searchable text from observation
+ text = (obs.get("summary", "") + " " + obs.get("context", {}).get("file_path", "")).lower()
+ if len(text) < 5:
+ return None
+
+ for name, cid in lookup.items():
+ if name in text:
+ return cid
+
+ return None
+
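Matching is a case-insensitive substring check over the summary plus file path, and the loader skips company names shorter than three characters to limit false positives. A tiny usage sketch (lookup contents hypothetical):

```python
# Hypothetical CRM lookup as produced by _load_client_lookup().
lookup = {"acme corp": "comp-acme"}

obs = {"summary": "Drafted proposal for Acme Corp renewal", "context": {}}
text = (obs["summary"] + " " + obs["context"].get("file_path", "")).lower()

matches = [cid for name, cid in lookup.items() if name in text]
assert matches == ["comp-acme"]  # observation gets tagged client_id=comp-acme
```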
def _load_observations(obs_dir: Path, processed_ids: set = None) -> List[Dict]:
"""Load all observations from JSONL files in directory.
From 8beb700bb25dd3e5bb443439a246841925fb893a Mon Sep 17 00:00:00 2001
From: John
Date: Sun, 29 Mar 2026 22:18:03 -0700
Subject: [PATCH 28/59] fix: remove hardcoded usernames from sanitize patterns
(#15)
Replace personal identifiers in viz.py _sanitize() with generic
API key detection pattern. Open-source code should not contain
developer-specific strings.
Co-authored-by: Ivan Pasichnyk
Co-authored-by: Claude Opus 4.6
---
openexp/viz.py | 3 +--
tests/test_viz.py | 4 ++--
2 files changed, 3 insertions(+), 4 deletions(-)
diff --git a/openexp/viz.py b/openexp/viz.py
index 2881c84..fc37250 100644
--- a/openexp/viz.py
+++ b/openexp/viz.py
@@ -1654,8 +1654,7 @@ def _sanitize(data):
r"/Users/\w+",
r"/home/\w+",
r"sk-ant-",
- r"welababeldata",
- r"ivanpasichnyk",
+ r"sk-[a-zA-Z0-9]{20,}",
]
def _check(obj, path=""):
diff --git a/tests/test_viz.py b/tests/test_viz.py
index 16ebb6a..9023cde 100644
--- a/tests/test_viz.py
+++ b/tests/test_viz.py
@@ -77,9 +77,9 @@ def test_api_key_caught(self):
with pytest.raises(ValueError, match="Sensitive data"):
_sanitize({"key": "sk-ant-abc123"})
- def test_username_caught(self):
+ def test_long_api_key_caught(self):
with pytest.raises(ValueError, match="Sensitive data"):
- _sanitize({"key": "ivanpasichnyk"})
+ _sanitize({"key": "sk-abcdefghijklmnopqrstuvwxyz"})
def test_numeric_values_ok(self):
data = {"q": 0.5, "count": 100, "nested": [1, 2, 3]}
From 5afe144861ca24938122756208271b8ebbedf775 Mon Sep 17 00:00:00 2001
From: John
Date: Sun, 29 Mar 2026 22:28:01 -0700
Subject: [PATCH 29/59] fix: correct Q-learning formula and q_init in README
(#16)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
- Q_init is 0.0, not 0.5 (memories earn value from zero)
- Formula is additive (Q + α*reward), not exponential moving average
- Reward range is [-1.0, 1.0], not [-0.5, 0.5]
- Add floor/ceiling parameters
Co-authored-by: Ivan Pasichnyk
Co-authored-by: Claude Opus 4.6
---
README.md | 7 ++++---
1 file changed, 4 insertions(+), 3 deletions(-)
diff --git a/README.md b/README.md
index 727f159..372a0c8 100644
--- a/README.md
+++ b/README.md
@@ -155,7 +155,7 @@ The MCP server provides 16 tools for memory operations, introspection, and calib
### Q-Learning Details
-Every memory has a Q-value (starts at 0.5). Three layers capture different aspects:
+Every memory has a Q-value (starting at 0.0 — memories earn value from zero). Three layers capture different aspects:
| Layer | Weight | Measures |
|-------|--------|----------|
@@ -166,10 +166,11 @@ Every memory has a Q-value (starts at 0.5). Three layers capture different aspec
Update rule:
```
-Q_new = (1 - α) × Q_old + α × reward
+Q_new = clamp(Q_old + α × reward, floor, ceiling)
α = 0.25 (learning rate)
-reward ∈ [-0.5, 0.5] (session productivity signal)
+reward ∈ [-1.0, 1.0] (productivity signal)
+floor = -0.5, ceiling = 1.0
```
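A minimal executable sketch of that rule, using the constants above (the helper name is illustrative, not the library's API):

```python
# Sketch of the additive, clamped Q-update described above.
ALPHA, FLOOR, CEILING = 0.25, -0.5, 1.0

def q_update(q_old: float, reward: float) -> float:
    """Q_new = clamp(Q_old + alpha * reward, floor, ceiling)."""
    return max(FLOOR, min(CEILING, q_old + ALPHA * reward))

assert q_update(0.0, 1.0) == 0.25      # a productive session lifts Q
assert q_update(-0.45, -1.0) == -0.5   # the floor stops runaway penalties
```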
Retrieval scoring combines five signals:
From 0e9b553df579d08a751051b311aafe2b5b447c18 Mon Sep 17 00:00:00 2001
From: John
Date: Mon, 30 Mar 2026 01:55:25 -0700
Subject: [PATCH 30/59] feat: pivot to business process learning engine (#17)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Bug fix: experience weights were never used — compute_session_reward()
always used hardcoded defaults. Now ingest_session() loads the active
experience and passes its weights to the reward pipeline.
New features:
- ProcessStage dataclass for defining business pipelines
- reward_memory_types filter: only reward decisions/insights, not noise
- Experience wizard now starts with process type (dev/sales/support/content)
- experience_info MCP tool returns process_stages and reward_memory_types
- Bundled YAMLs updated with real pipeline stages
All 250 tests pass. 12 new tests added for process stages, memory type
filtering, backward compatibility, and ingest pipeline integration.
Co-authored-by: Ivan Pasichnyk
Co-authored-by: Claude Opus 4.6
---
README.md | 66 ++++++----
docs/experiences.md | 82 ++++++++++++-
docs/product-page-content.md | 40 +++----
openexp/cli.py | 138 ++++++++++++++++++---
openexp/core/experience.py | 34 ++++++
openexp/data/experiences/dealflow.yaml | 29 +++++
openexp/data/experiences/default.yaml | 26 +++-
openexp/data/experiences/sales.yaml | 26 ++++
openexp/ingest/__init__.py | 18 ++-
openexp/ingest/reward.py | 27 +++++
openexp/mcp_server.py | 5 +
tests/test_experience.py | 160 +++++++++++++++++++++++++
tests/test_session_end.py | 74 ++++++++++++
13 files changed, 661 insertions(+), 64 deletions(-)
diff --git a/README.md b/README.md
index 372a0c8..975ae02 100644
--- a/README.md
+++ b/README.md
@@ -1,8 +1,8 @@
OpenExp
- Q-learning memory for Claude Code
- Your AI learns from experience.
+ Self-labeling experience engine for AI agents
+ Define your process. Outcomes label your data. AI learns what works.
@@ -25,17 +25,31 @@
---
-Every Claude Code session starts from zero. OpenExp changes that.
+Memory tools store and retrieve. OpenExp **learns which memories actually help you get work done** — and surfaces those first next time.
-It gives Claude Code **persistent memory that learns**. Not just storage — actual reinforcement learning. Memories that lead to productive sessions (commits, PRs, passing tests) get higher Q-values and surface first next time. Bad memories sink.
+You define your process (software dev, sales, support, content). Every outcome — commit, closed deal, resolved ticket — feeds back as a reward signal. Over time, proven memories rank higher. Noise sinks.
-The same idea behind AlphaGo, applied to your coding assistant's context window.
+### How it works for a sales team
+
+```yaml
+# .openexp.yaml in your sales project
+experience: sales
+```
+
+```
+1. Define your pipeline: lead → contacted → qualified → proposal → won
+2. Work normally — Claude remembers client preferences, deal context, pricing
+3. Deal closes → all memories tagged with that client get rewarded
+4. Next similar deal → the insights that led to the close surface first
+```
+
+The same idea behind AlphaGo, applied to your AI agent's working memory.
## The Problem
-Claude Code forgets everything between sessions. You re-explain your project structure, your preferences, your past decisions — every single time.
+AI agents forget everything between sessions. Existing memory tools (Mem0, Zep, LangMem) just store and retrieve — every memory is equally important. A two-month-old note about a deleted feature has the same weight as yesterday's critical architecture decision.
-Existing memory tools just store and retrieve. They treat a two-month-old note about a deleted feature the same as yesterday's critical architecture decision.
+**The missing piece:** there's no learning. No feedback loop from outcomes to retrieval quality.
## The Solution
@@ -44,9 +58,9 @@ OpenExp adds a **closed-loop learning system**:
```
Session starts → recall memories (ranked by Q-value)
↓
-Claude works → observations captured automatically
+Agent works → observations captured automatically
↓
-Session ends → productive? (commits, PRs, tests)
+Session ends → productive? (commits, PRs, closed deals, resolved tickets)
↓
YES → reward recalled memories (Q-values go up)
NO → penalize them (Q-values go down)
@@ -68,17 +82,16 @@ CRM: Acme deal moves negotiation → won
resolve_outcomes → finds memories tagged comp-acme → reward +0.8
```
-This creates a much stronger learning signal than "did this session have git commits?"
-
After a few sessions, OpenExp learns what context actually helps you get work done.
## Why OpenExp?
| Feature | OpenExp | Mem0 | Zep/Graphiti | LangMem |
|---------|---------|------|-------------|---------|
-| **Q-learning on memories** | Yes — memories earn/lose rank from session outcomes | No | No | No |
-| **Closed-loop rewards** | Session productivity → Q-value updates automatically | No | No | No |
-| **Outcome-based rewards** | Real business events (CRM, deployments) → targeted rewards | No | No | No |
+| **Learns from outcomes** | Yes — Q-learning from real business results | No | No | No |
+| **Process-aware** | Define pipeline stages with reward signals | No | No | No |
+| **Memory type filtering** | Reward only decisions/insights, not noise | No | No | No |
+| **Outcome-based rewards** | CRM deal closes → tagged memories get rewarded | No | No | No |
| **Claude Code native** | Zero-config hooks, works out of the box | Requires integration | Requires integration | Requires integration |
| **Local-first** | Qdrant + FastEmbed, no cloud, no API key for core | Cloud API | Cloud or self-hosted | Cloud API |
| **Hybrid retrieval** | BM25 + vector + recency + importance + Q-value (5 signals) | Vector only | Graph + vector | Vector only |
@@ -386,22 +399,31 @@ openexp ingest # ingest into Qdrant
openexp stats # check Q-cache state
```
-## Experiences
+## Experiences — Define Your Process
-Not everyone writes code. OpenExp ships with three **Experiences** — domain-specific reward profiles:
+Not everyone writes code. An **Experience** defines what "productive" means for your workflow, including pipeline stages and which memory types matter.
-| Experience | Optimized For | Top Signals |
-|------------|--------------|-------------|
-| `default` | Software engineering | commits, PRs, tests |
-| `sales` | Sales & outreach | decisions, emails, follow-ups |
-| `dealflow` | Deal pipeline (lead → payment) | proposals, invoices, payments |
+| Experience | Process | Top Signals |
+|------------|---------|-------------|
+| `default` | backlog → in_progress → review → merged → deployed | commits, PRs, tests |
+| `sales` | lead → contacted → qualified → proposal → negotiation → won | decisions, emails, follow-ups |
+| `dealflow` | lead → discovery → nda → proposal → negotiation → invoice → paid | proposals, invoices, payments |
Switch with one env var:
```bash
export OPENEXP_EXPERIENCE=dealflow
```
-**Create your own** — answer a questionnaire, get a YAML. See the [Experiences Guide](docs/experiences.md).
+Each experience also controls **which memory types get rewarded** — sales rewards decisions and insights, not raw tool actions. This means the system learns faster because it focuses on the signal, not the noise.
+
+**Create your own** with the interactive wizard:
+```bash
+openexp experience create
+# Pick a process type (dev/sales/support/content)
+# Customize stages, signal weights, memory type filters
+```
+
+See the [Experiences Guide](docs/experiences.md) for full details.
## Documentation
diff --git a/docs/experiences.md b/docs/experiences.md
index bb43d35..868b908 100644
--- a/docs/experiences.md
+++ b/docs/experiences.md
@@ -4,6 +4,13 @@ An **Experience** is a domain-specific reward profile that tells OpenExp what "p
The default experience rewards coding outputs (commits, PRs, tests). But if your work is sales, devops, content creation, or research — the signals are different. Experiences let you define that.
+An experience consists of:
+- **Signal weights** — how much each action type is worth
+- **Process stages** — your pipeline (backlog → done, lead → won)
+- **Memory type filter** — which memory types receive rewards (decisions only? everything?)
+- **Retrieval boosts** — which types rank higher in search
+- **Learning speed** — how fast Q-values update
+
## How It Works
After each Claude Code session, OpenExp computes a reward score: did this session accomplish something useful?
@@ -17,7 +24,9 @@ Apply weights from active Experience
↓
reward = sum(signal × weight) + base + penalties
↓
-Update Q-values for all memories from this session
+Filter: only reward memory types that matter (e.g., decisions, not raw actions)
+ ↓
+Update Q-values for matching memories from this session
↓
Next session → memories from productive sessions rank higher
```
@@ -108,6 +117,59 @@ openexp experience list
openexp experience info # shows active + weights
```
+## Process Stages
+
+Each experience can define **pipeline stages** — the steps in your business process. Stages are declarative: they define what the pipeline looks like and what reward a memory earns when the process advances to that stage.
+
+```yaml
+process_stages:
+ - name: lead
+ description: New lead identified
+ reward_on_enter: 0.0
+ - name: qualified
+ description: Lead confirmed as viable
+ reward_on_enter: 0.2
+ - name: proposal
+ description: Proposal sent
+ reward_on_enter: 0.3
+ - name: won
+ description: Deal closed
+ reward_on_enter: 0.8
+```
+
+Stages are not enforced at runtime. Outcome resolvers (e.g., `CRMCSVResolver`) consume them to determine reward magnitude when a deal moves from one stage to another: the new stage's `reward_on_enter` is the reward applied on entry.
+
+Stages can also be defined as simple strings:
+
+```yaml
+process_stages:
+ - backlog
+ - in_progress
+ - review
+ - done
+```
+
+The string form defaults every stage to `reward_on_enter: 0.0`.
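
Both forms normalize to the same dataclass introduced by this patch; a quick equivalence check:

```python
from dataclasses import dataclass

@dataclass
class ProcessStage:  # mirrors the dataclass added in openexp/core/experience.py
    name: str
    description: str = ""
    reward_on_enter: float = 0.0

# A bare string stage parses identically to its expanded dict form.
assert ProcessStage("backlog") == ProcessStage(
    name="backlog", description="", reward_on_enter=0.0
)
```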
+
+## Memory Type Filter (`reward_memory_types`)
+
+By default, all recalled memories receive session rewards. But in many workflows, raw action observations (e.g., "ran git status") are noise — you only want to reward the insights and decisions that drove the outcome.
+
+```yaml
+# Only reward these memory types during session reward
+reward_memory_types:
+ - decision
+ - insight
+ - outcome
+```
+
+When set, OpenExp fetches the memory type from Qdrant and filters out non-matching memories before applying rewards. This means:
+- **Decisions** about client strategy get rewarded when a deal closes
+- **Raw tool observations** like "Read file.py" don't accumulate noise Q-values
+- The system learns faster because signal-to-noise ratio is higher
+
+An empty list (or omitting the field) preserves the default behavior: reward all recalled memories.
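The filter itself reduces to a membership test on the memory's payload type; a simplified sketch (the real implementation in `openexp/ingest/reward.py` fetches payloads from Qdrant first):

```python
# Simplified sketch of the reward-time filter (payload fetch elided).
reward_memory_types = ["decision", "insight", "outcome"]
recalled = [
    {"id": "mem-1", "memory_type": "decision"},
    {"id": "mem-2", "memory_type": "action"},   # raw tool noise
]
to_reward = [
    m["id"] for m in recalled
    if not reward_memory_types or m["memory_type"] in reward_memory_types
]
assert to_reward == ["mem-1"]  # an empty filter list would reward both
```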
+
## Creating Your Own Experience
### Step 1: Answer These Questions
@@ -183,6 +245,24 @@ retrieval_boosts:
outcome: 1.2
q_config_overrides:
alpha: 0.25 # learning rate
+
+# Pipeline stages (optional — used by outcome resolvers)
+process_stages:
+ - name: lead
+ description: New opportunity
+ reward_on_enter: 0.0
+ - name: proposal
+ description: Proposal sent
+ reward_on_enter: 0.3
+ - name: won
+ description: Deal closed
+ reward_on_enter: 0.8
+
+# Which memory types receive session rewards (optional — empty = all)
+reward_memory_types:
+ - decision
+ - insight
+ - outcome
```
### Step 3: Activate
diff --git a/docs/product-page-content.md b/docs/product-page-content.md
index 27853ca..5f0a370 100644
--- a/docs/product-page-content.md
+++ b/docs/product-page-content.md
@@ -7,17 +7,17 @@
## Headline
-**Your AI sessions don't learn from each other. OpenExp fixes that.**
+**Your AI doesn't learn from outcomes. OpenExp fixes that.**
## Subheadline
-Persistent memory for Claude Code with Q-learning. Every outcome — commit, deploy, closed deal — feeds back as a signal. Over time, your AI knows what works.
+A self-labeling experience engine for AI agents. Define your business process — software dev, sales, support — and outcomes automatically label which memories matter. Over time, your AI knows what works.
---
## The Problem
-There are three ways people give context to AI coding assistants today.
+There are three ways people give context to AI agents today.
### 1. Static instructions (CLAUDE.md)
@@ -67,51 +67,47 @@ This is Q-learning — the same algorithm that trained AlphaGo. Applied to your
---
-## Experiences — Different Lenses on the Same Memory
+## Experiences — Your Process, Your Rewards
-One memory can be valuable in one context and worthless in another.
-
-An Experience is a definition of what "success" means for a specific workflow. You create it as a simple YAML config.
+One memory can be valuable in one context and worthless in another. An Experience defines what "success" means for a specific workflow — including the process pipeline and which memory types matter.
### For a developer (default)
```yaml
+process_stages: [backlog, in_progress, review, merged, deployed]
weights:
- commit: 0.3
- pr: 0.2
- deploy: 0.1
- tests: 0.1
+  {commit: 0.3, pr: 0.2, deploy: 0.1, tests: 0.1}
+reward_memory_types: [decision, insight, outcome, action]
```
### For sales
```yaml
+process_stages: [lead, contacted, qualified, proposal, negotiation, won]
weights:
- email_sent: 0.15
- proposal_sent: 0.20
- payment_received: 0.30
- commit: 0.0
+  {email_sent: 0.15, proposal_sent: 0.20, payment_received: 0.30}
+reward_memory_types: [decision, insight, outcome] # skip raw actions
```
### For support
```yaml
+process_stages: [new_ticket, investigating, responded, resolved, closed]
weights:
- ticket_closed: 0.25
- escalation_avoided: 0.20
- customer_reply: 0.10
+  {ticket_closed: 0.25, email_sent: 0.10}
+reward_memory_types: [decision, insight, outcome]
```
### For content creation
```yaml
+process_stages: [idea, draft, review, published, distributed]
weights:
- post_published: 0.25
- engagement: 0.15
- subscriber_gained: 0.20
+  {writes: 0.05, deploy: 0.20, decisions: 0.15}
+reward_memory_types: [decision, insight, outcome]
```
-**Each memory holds separate scores per experience.** In a sales context, sales-relevant memories surface. In a coding context — coding memories.
+**Each memory holds separate scores per experience.** In a sales context, sales-relevant memories surface. In a coding context — coding memories. Memory type filtering ensures only meaningful memories (decisions, insights) accumulate rewards — raw tool observations stay at baseline.
### Example
diff --git a/openexp/cli.py b/openexp/cli.py
index 8b63e41..542106f 100644
--- a/openexp/cli.py
+++ b/openexp/cli.py
@@ -319,6 +319,34 @@ def _ask_choice(prompt: str, choices: list[tuple[str, str]], default: int = 1) -
print(f" Please enter 1-{len(choices)}.")
+_PROCESS_PRESETS = {
+ "dev": {
+ "label": "Software Development",
+ "stages": ["backlog", "in_progress", "review", "merged", "deployed"],
+ "stage_rewards": [0.0, 0.05, 0.2, 0.3, 0.4],
+ "signal_defaults": {"commit": 8, "pr": 7, "writes": 5, "tests": 6, "deploy": 6, "decisions": 5},
+ },
+ "sales": {
+ "label": "Sales & Outreach",
+ "stages": ["lead", "contacted", "qualified", "proposal", "negotiation", "won"],
+ "stage_rewards": [0.0, 0.1, 0.2, 0.3, 0.4, 0.8],
+ "signal_defaults": {"decisions": 8, "email_sent": 7, "follow_up": 6, "proposal_sent": 8, "payment_received": 10},
+ },
+ "support": {
+ "label": "Customer Support",
+ "stages": ["new_ticket", "investigating", "responded", "resolved", "closed"],
+ "stage_rewards": [0.0, 0.05, 0.15, 0.3, 0.4],
+ "signal_defaults": {"decisions": 6, "email_sent": 7, "ticket_closed": 9, "writes": 3},
+ },
+ "content": {
+ "label": "Content Creation",
+ "stages": ["idea", "draft", "review", "published", "distributed"],
+ "stage_rewards": [0.0, 0.1, 0.2, 0.35, 0.4],
+ "signal_defaults": {"writes": 7, "commit": 5, "deploy": 8, "decisions": 6, "email_sent": 4},
+ },
+}
+
+
def _experience_create_wizard():
"""Interactive wizard to create a custom experience YAML."""
import yaml
@@ -328,17 +356,67 @@ def _experience_create_wizard():
print(" OpenExp — Create Custom Experience")
print("=" * 50)
+ # Process type (new — asked first)
+ process_idx = _ask_choice(
+ "What kind of process does this experience track?",
+ [
+ ("Software Dev", "commits, PRs, deploys"),
+ ("Sales", "leads, proposals, payments"),
+ ("Support", "tickets, responses, resolutions"),
+ ("Content", "drafts, publishing, distribution"),
+ ],
+ default=1,
+ )
+ process_keys = ["dev", "sales", "support", "content"]
+ preset_key = process_keys[process_idx]
+ preset = _PROCESS_PRESETS[preset_key]
+
+ print(f"\n Using '{preset['label']}' preset as starting point.")
+ print(f" Pipeline stages: {' -> '.join(preset['stages'])}")
+
+ # Ask if custom stages
+ custom_stages_idx = _ask_choice(
+ "Use these pipeline stages?",
+ [
+ ("Yes", f"use preset stages: {', '.join(preset['stages'])}"),
+ ("Custom", "enter your own stages (comma-separated)"),
+ ],
+ default=1,
+ )
+
+ if custom_stages_idx == 0:
+ stage_names = preset["stages"]
+ stage_rewards = preset["stage_rewards"]
+ else:
+ raw = input("Enter stages (comma-separated, in order): ").strip()
+ stage_names = [s.strip().replace(" ", "_") for s in raw.split(",") if s.strip()]
+ if not stage_names:
+ stage_names = preset["stages"]
+ print(f" No stages entered, using preset: {', '.join(stage_names)}")
+ # Auto-assign rewards linearly
+ n = len(stage_names)
+ stage_rewards = [round(i * 0.8 / max(n - 1, 1), 2) for i in range(n)]
+ print(f" Auto-assigned rewards: {dict(zip(stage_names, stage_rewards))}")
+
+ process_stages = [
+ {"name": name, "reward_on_enter": rwd}
+ for name, rwd in zip(stage_names, stage_rewards)
+ ]
+
# Name
+ default_name = preset_key
while True:
- name = input("\nExperience name (lowercase, no spaces): ").strip().lower().replace(" ", "-")
- if name and name.isidentifier() or all(c.isalnum() or c == "-" for c in name):
+ name = input(f"\nExperience name (lowercase, no spaces) [{default_name}]: ").strip().lower().replace(" ", "-")
+ if not name:
+ name = default_name
+ if name and (name.isidentifier() or all(c.isalnum() or c == "-" for c in name)):
break
print(" Use only letters, numbers, and hyphens.")
# Description
- desc = input("One-line description: ").strip() or f"{name} experience"
+ desc = input(f"One-line description [{preset['label']} experience]: ").strip() or f"{preset['label']} experience"
- # Signal ratings
+ # Signal ratings (with preset defaults)
signals = [
("commit", "Committed code to git"),
("pr", "Created a Pull Request"),
@@ -361,13 +439,15 @@ def _experience_create_wizard():
("payment_received", "Payment received"),
]
+ defaults = preset.get("signal_defaults", {})
print("\n--- Rate each signal 0-10 (how important for YOUR workflow) ---")
print(" 10 = this IS the goal 5 = moderate 0 = irrelevant")
- print()
+ print(f" Preset defaults shown in brackets.\n")
weights = {}
for key, label in signals:
- rating = _ask_int(f" {label}", 0, 10, default=0)
+ default_val = defaults.get(key, 0)
+ rating = _ask_int(f" {label}", 0, 10, default=default_val)
w = _rating_to_weight(rating)
if key == "writes":
w = round(w / 5, 3) # per-file weight, cap at ~0.06/file
@@ -394,15 +474,32 @@ def _experience_create_wizard():
alpha_idx = _ask_choice(
"How fast does your domain change?",
[
- ("Fast", "sales, news — learn fast, forget fast (α=0.30)"),
- ("Normal", "engineering — balanced (α=0.25)"),
- ("Slow", "research, legal — accumulate gradually (α=0.15)"),
+ ("Fast", "sales, news — learn fast, forget fast (alpha=0.30)"),
+ ("Normal", "engineering — balanced (alpha=0.25)"),
+ ("Slow", "research, legal — accumulate gradually (alpha=0.15)"),
],
default=2,
)
alpha_values = [0.30, 0.25, 0.15]
alpha = alpha_values[alpha_idx]
+ # Memory type filter (new)
+ mem_filter_idx = _ask_choice(
+ "Which memory types should receive session rewards?",
+ [
+ ("All types", "reward every recalled memory (default for dev)"),
+ ("Decisions+Insights+Outcomes", "skip raw action/observation memories"),
+ ("Only decisions", "most selective — only strategic choices get rewarded"),
+ ],
+ default=1 if preset_key == "dev" else 2,
+ )
+ reward_memory_types_options = [
+ [], # empty = all
+ ["decision", "insight", "outcome"],
+ ["decision"],
+ ]
+ reward_memory_types = reward_memory_types_options[mem_filter_idx]
+
# Retrieval boosts
print("\n--- Which memory types should rank higher in search? ---")
boosts = {}
@@ -415,9 +512,9 @@ def _experience_create_wizard():
boost_idx = _ask_choice(
f"Boost for '{mem_type}' ({label})?",
[
- ("None", "no boost (1.0×)"),
- ("Mild", "slight boost (1.1×)"),
- ("Strong", "significant boost (1.3×)"),
+ ("None", "no boost (1.0x)"),
+ ("Mild", "slight boost (1.1x)"),
+ ("Strong", "significant boost (1.3x)"),
],
default=1,
)
@@ -444,19 +541,27 @@ def _experience_create_wizard():
"outcome_resolvers": resolvers,
"retrieval_boosts": boosts if boosts else {},
"q_config_overrides": {"alpha": alpha} if alpha != 0.25 else {},
+ "process_stages": process_stages,
}
+ if reward_memory_types:
+ experience["reward_memory_types"] = reward_memory_types
# Summary
total_positive = sum(v for v in weights.values() if v > 0)
print("\n" + "=" * 50)
print(f" Experience: {name}")
print(f" Description: {desc}")
+ print(f" Process: {' -> '.join(stage_names)}")
print(f" Total positive weight: {total_positive:.2f}")
if total_positive < 0.5:
- print(" ⚠ Low total — sessions may rarely earn positive reward")
+ print(" Warning: Low total — sessions may rarely earn positive reward")
elif total_positive > 1.5:
- print(" ⚠ High total — most sessions will max out reward")
+ print(" Warning: High total — most sessions will max out reward")
print(f" Alpha: {alpha}")
+ if reward_memory_types:
+ print(f" Reward memory types: {', '.join(reward_memory_types)}")
+ else:
+ print(f" Reward memory types: all")
print("=" * 50)
yaml_text = yaml.dump(experience, default_flow_style=False, sort_keys=False)
@@ -529,6 +634,11 @@ def cmd_experience(args):
"outcome_resolvers": exp.outcome_resolvers,
"retrieval_boosts": exp.retrieval_boosts,
"q_config_overrides": exp.q_config_overrides,
+ "process_stages": [
+ {"name": s.name, "description": s.description, "reward_on_enter": s.reward_on_enter}
+ for s in exp.process_stages
+ ],
+ "reward_memory_types": exp.reward_memory_types,
}
print(json.dumps(info, indent=2))
diff --git a/openexp/core/experience.py b/openexp/core/experience.py
index 1116e15..da73aaa 100644
--- a/openexp/core/experience.py
+++ b/openexp/core/experience.py
@@ -24,6 +24,15 @@
_BUNDLED_DIR = Path(__file__).parent.parent / "data" / "experiences"
+@dataclass
+class ProcessStage:
+ """A stage in a business process pipeline."""
+
+ name: str
+ description: str = ""
+ reward_on_enter: float = 0.0
+
+
@dataclass
class Experience:
"""A domain-specific Q-value context."""
@@ -34,6 +43,8 @@ class Experience:
outcome_resolvers: List[str] = field(default_factory=list)
retrieval_boosts: Dict[str, float] = field(default_factory=dict)
q_config_overrides: Dict[str, float] = field(default_factory=dict)
+ process_stages: List[ProcessStage] = field(default_factory=list)
+ reward_memory_types: List[str] = field(default_factory=list)
DEFAULT_EXPERIENCE = Experience(
@@ -62,11 +73,32 @@ def _user_experiences_dir() -> Path:
return EXPERIENCES_DIR
+def _parse_process_stages(raw: list) -> List[ProcessStage]:
+ """Parse process_stages from YAML — supports dict and string formats."""
+ stages = []
+ for item in raw:
+ if isinstance(item, dict):
+ stages.append(ProcessStage(
+ name=item.get("name", ""),
+ description=item.get("description", ""),
+ reward_on_enter=float(item.get("reward_on_enter", 0.0)),
+ ))
+ elif isinstance(item, str):
+ stages.append(ProcessStage(name=item))
+ else:
+ logger.warning("Skipping invalid process_stage entry: %s", item)
+ return stages
+
+
def _parse_yaml(path: Path) -> Experience:
"""Parse a YAML file into an Experience."""
data = yaml.safe_load(path.read_text())
if not isinstance(data, dict):
raise ValueError(f"Invalid experience YAML: {path}")
+
+ raw_stages = data.get("process_stages", [])
+ process_stages = _parse_process_stages(raw_stages) if raw_stages else []
+
return Experience(
name=data.get("name", path.stem),
description=data.get("description", ""),
@@ -74,6 +106,8 @@ def _parse_yaml(path: Path) -> Experience:
outcome_resolvers=data.get("outcome_resolvers", []),
retrieval_boosts=data.get("retrieval_boosts", {}),
q_config_overrides=data.get("q_config_overrides", {}),
+ process_stages=process_stages,
+ reward_memory_types=data.get("reward_memory_types", []),
)
diff --git a/openexp/data/experiences/dealflow.yaml b/openexp/data/experiences/dealflow.yaml
index e4f5375..b9bea7b 100644
--- a/openexp/data/experiences/dealflow.yaml
+++ b/openexp/data/experiences/dealflow.yaml
@@ -29,3 +29,32 @@ retrieval_boosts:
fact: 1.1
q_config_overrides:
alpha: 0.30
+
+process_stages:
+ - name: lead
+ description: Inbound or outbound lead
+ reward_on_enter: 0.0
+ - name: discovery
+ description: Initial call or meeting to understand needs
+ reward_on_enter: 0.1
+ - name: nda
+ description: NDA exchanged
+ reward_on_enter: 0.15
+ - name: proposal
+ description: Proposal sent with pricing
+ reward_on_enter: 0.25
+ - name: negotiation
+ description: Negotiating terms, SOW, timeline
+ reward_on_enter: 0.3
+ - name: invoice
+ description: Invoice sent
+ reward_on_enter: 0.5
+ - name: paid
+ description: Payment received — terminal reward
+ reward_on_enter: 0.8
+
+# Dealflow: decisions and insights drive deals, not raw tool usage
+reward_memory_types:
+ - decision
+ - insight
+ - outcome
diff --git a/openexp/data/experiences/default.yaml b/openexp/data/experiences/default.yaml
index ab4ac8e..713d94c 100644
--- a/openexp/data/experiences/default.yaml
+++ b/openexp/data/experiences/default.yaml
@@ -1,5 +1,5 @@
name: default
-description: General-purpose experience with balanced weights
+description: General-purpose software engineering experience with balanced weights
session_reward_weights:
commit: 0.3
pr: 0.2
@@ -13,3 +13,27 @@ session_reward_weights:
outcome_resolvers: []
retrieval_boosts: {}
q_config_overrides: {}
+
+process_stages:
+ - name: backlog
+ description: Task identified but not started
+ reward_on_enter: 0.0
+ - name: in_progress
+ description: Actively working on task
+ reward_on_enter: 0.05
+ - name: review
+ description: Code submitted for review (PR created)
+ reward_on_enter: 0.2
+ - name: merged
+ description: Code merged to main branch
+ reward_on_enter: 0.3
+ - name: deployed
+ description: Live in production
+ reward_on_enter: 0.4
+
+# Dev process rewards actions/decisions/insights/outcomes
+reward_memory_types:
+ - decision
+ - insight
+ - outcome
+ - action
diff --git a/openexp/data/experiences/sales.yaml b/openexp/data/experiences/sales.yaml
index a6c663f..31bc6ea 100644
--- a/openexp/data/experiences/sales.yaml
+++ b/openexp/data/experiences/sales.yaml
@@ -17,3 +17,29 @@ retrieval_boosts:
outcome: 1.1
q_config_overrides:
alpha: 0.3
+
+process_stages:
+ - name: lead
+ description: New lead identified
+ reward_on_enter: 0.0
+ - name: contacted
+ description: Initial outreach sent
+ reward_on_enter: 0.1
+ - name: qualified
+ description: Lead confirmed as viable opportunity
+ reward_on_enter: 0.2
+ - name: proposal
+ description: Proposal or quote sent
+ reward_on_enter: 0.3
+ - name: negotiation
+ description: Active negotiation on terms
+ reward_on_enter: 0.4
+ - name: won
+ description: Deal closed, payment expected
+ reward_on_enter: 0.8
+
+# Sales process: focus on decisions and insights, not raw actions
+reward_memory_types:
+ - decision
+ - insight
+ - outcome
diff --git a/openexp/ingest/__init__.py b/openexp/ingest/__init__.py
index 7d5898d..8a8fe01 100644
--- a/openexp/ingest/__init__.py
+++ b/openexp/ingest/__init__.py
@@ -53,6 +53,10 @@ def ingest_session(
from .observation import ingest_observations
from .session_summary import ingest_sessions
from .reward import compute_session_reward, reward_retrieved_memories, _build_session_reward_context
+ from ..core.experience import get_active_experience
+
+ # Load active experience so weights/config are used throughout
+ experience = get_active_experience()
result = {}
@@ -81,22 +85,27 @@ def ingest_session(
session_obs = raw_obs
if session_id and session_obs:
- reward = compute_session_reward(session_obs)
+ # BUG FIX: pass experience weights instead of hardcoded defaults
+ reward = compute_session_reward(session_obs, weights=experience.session_reward_weights)
if reward != 0.0:
reward_ctx = _build_session_reward_context(session_obs, reward)
# Reward only memories that were RECALLED at session start (closed loop)
retrieved_updated = reward_retrieved_memories(
- session_id, reward, reward_context=reward_ctx,
+ session_id, reward,
+ experience=experience.name,
+ reward_context=reward_ctx,
+ reward_memory_types=experience.reward_memory_types,
)
result["reward"] = {
"applied": True,
"value": reward,
"retrieved_memories_rewarded": retrieved_updated,
"session_observations": len(session_obs),
+ "experience": experience.name,
}
logger.info(
- "Session reward=%.2f applied to %d retrieved memories (from %d session obs)",
- reward, retrieved_updated, len(session_obs),
+ "Session reward=%.2f applied to %d retrieved memories (from %d session obs, experience=%s)",
+ reward, retrieved_updated, len(session_obs), experience.name,
)
else:
result["reward"] = {"applied": False, "value": 0.0, "reason": "neutral session", "retrieved_memories_rewarded": 0}
@@ -121,6 +130,7 @@ def ingest_session(
resolvers=resolvers,
q_cache=q_cache,
q_updater=q_updater,
+ experience=experience.name,
)
result["outcomes"] = outcome_result
diff --git a/openexp/ingest/reward.py b/openexp/ingest/reward.py
index 8a5e3f9..e7bc84b 100644
--- a/openexp/ingest/reward.py
+++ b/openexp/ingest/reward.py
@@ -221,10 +221,14 @@ def reward_retrieved_memories(
reward: float,
experience: str = "default",
reward_context: Optional[str] = None,
+ reward_memory_types: Optional[List[str]] = None,
) -> int:
"""Reward memories that were retrieved at session start.
Closes the loop: memories retrieved -> session outcome -> Q-value update.
+
+ If reward_memory_types is set, only memories of those types receive reward.
+ Empty list = reward all (preserves current behavior).
"""
from .retrieval_log import get_session_retrievals
@@ -232,6 +236,29 @@ def reward_retrieved_memories(
if not memory_ids:
return 0
+ # Filter by memory type if configured
+ if reward_memory_types:
+ try:
+ from ..core.direct_search import _get_qdrant
+ client = _get_qdrant()
+ from ..core.config import COLLECTION_NAME
+ points = client.retrieve(collection_name=COLLECTION_NAME, ids=memory_ids, with_payload=True)
+ filtered = [
+ p.id for p in points
+ if p.payload.get("memory_type", "fact") in reward_memory_types
+ ]
+ if filtered != memory_ids:
+ logger.info(
+ "Memory type filter: %d/%d memories match types %s",
+ len(filtered), len(memory_ids), reward_memory_types,
+ )
+ memory_ids = filtered
+ except Exception as e:
+ logger.warning("Failed to filter by memory type, rewarding all: %s", e)
+
+ if not memory_ids:
+ return 0
+
updated = apply_session_reward(memory_ids, reward, experience=experience, reward_context=reward_context)
logger.info(
"Rewarded %d retrieved memories for session %s (reward=%.2f, experience=%s)",
diff --git a/openexp/mcp_server.py b/openexp/mcp_server.py
index 0021244..c8c1b00 100644
--- a/openexp/mcp_server.py
+++ b/openexp/mcp_server.py
@@ -461,6 +461,11 @@ def handle_request(request: dict) -> dict:
"outcome_resolvers": active_experience.outcome_resolvers,
"retrieval_boosts": active_experience.retrieval_boosts,
"q_config_overrides": active_experience.q_config_overrides,
+ "process_stages": [
+ {"name": s.name, "description": s.description, "reward_on_enter": s.reward_on_enter}
+ for s in active_experience.process_stages
+ ],
+ "reward_memory_types": active_experience.reward_memory_types,
"stats": q_cache.get_experience_stats(exp_name),
}
return {"content": [{"type": "text", "text": json.dumps(info, indent=2, default=str)}]}
diff --git a/tests/test_experience.py b/tests/test_experience.py
index 7ec136c..cfba5bc 100644
--- a/tests/test_experience.py
+++ b/tests/test_experience.py
@@ -8,11 +8,13 @@
from openexp.core.experience import (
Experience,
+ ProcessStage,
DEFAULT_EXPERIENCE,
load_experience,
get_active_experience,
list_experiences,
_parse_yaml,
+ _parse_process_stages,
)
from openexp.core.q_value import (
QCache,
@@ -320,3 +322,161 @@ def test_compute_session_reward_with_weights():
}
reward_sales = compute_session_reward(observations, weights=sales_weights)
assert isinstance(reward_sales, float)
+
+
+# --- ProcessStage parsing ---
+
+def test_parse_process_stages_dict_format():
+ raw = [
+ {"name": "lead", "description": "New lead", "reward_on_enter": 0.1},
+ {"name": "won", "description": "Deal closed", "reward_on_enter": 0.8},
+ ]
+ stages = _parse_process_stages(raw)
+ assert len(stages) == 2
+ assert stages[0].name == "lead"
+ assert stages[0].description == "New lead"
+ assert stages[0].reward_on_enter == 0.1
+ assert stages[1].reward_on_enter == 0.8
+
+
+def test_parse_process_stages_string_format():
+ raw = ["backlog", "in_progress", "done"]
+ stages = _parse_process_stages(raw)
+ assert len(stages) == 3
+ assert stages[0].name == "backlog"
+ assert stages[0].description == ""
+ assert stages[0].reward_on_enter == 0.0
+
+
+def test_parse_process_stages_mixed_format():
+ raw = [
+ "lead",
+ {"name": "won", "reward_on_enter": 0.8},
+ ]
+ stages = _parse_process_stages(raw)
+ assert len(stages) == 2
+ assert stages[0].name == "lead"
+ assert stages[1].name == "won"
+ assert stages[1].reward_on_enter == 0.8
+
+
+def test_parse_process_stages_empty():
+ assert _parse_process_stages([]) == []
+
+
+# --- reward_memory_types ---
+
+def test_reward_memory_types_from_yaml(tmp_path, monkeypatch):
+ yaml_content = """
+name: filtered
+description: Test with reward_memory_types
+session_reward_weights:
+ commit: 0.3
+reward_memory_types:
+ - decision
+ - insight
+"""
+ (tmp_path / "filtered.yaml").write_text(yaml_content)
+ monkeypatch.setattr("openexp.core.config.EXPERIENCES_DIR", tmp_path)
+
+ exp = load_experience("filtered")
+ assert exp.reward_memory_types == ["decision", "insight"]
+
+
+def test_reward_memory_types_default_empty(tmp_path, monkeypatch):
+ """Old YAML without reward_memory_types should default to empty list."""
+ yaml_content = """
+name: old_format
+description: No reward_memory_types field
+session_reward_weights:
+ commit: 0.3
+"""
+ (tmp_path / "old_format.yaml").write_text(yaml_content)
+ monkeypatch.setattr("openexp.core.config.EXPERIENCES_DIR", tmp_path)
+
+ exp = load_experience("old_format")
+ assert exp.reward_memory_types == []
+
+
+# --- Backward compat: old YAML without new fields ---
+
+def test_backward_compat_old_yaml(tmp_path, monkeypatch):
+ """YAML without process_stages and reward_memory_types loads fine."""
+ yaml_content = """
+name: legacy
+description: Old format experience
+session_reward_weights:
+ commit: 0.3
+ pr: 0.2
+outcome_resolvers: []
+retrieval_boosts: {}
+q_config_overrides: {}
+"""
+ (tmp_path / "legacy.yaml").write_text(yaml_content)
+ monkeypatch.setattr("openexp.core.config.EXPERIENCES_DIR", tmp_path)
+
+ exp = load_experience("legacy")
+ assert exp.name == "legacy"
+ assert exp.process_stages == []
+ assert exp.reward_memory_types == []
+ assert exp.session_reward_weights["commit"] == 0.3
+
+
+# --- Bundled YAMLs have process_stages ---
+
+def test_bundled_sales_has_process_stages():
+ exp = load_experience("sales")
+ assert len(exp.process_stages) > 0
+ stage_names = [s.name for s in exp.process_stages]
+ assert "lead" in stage_names
+ assert "won" in stage_names
+
+
+def test_bundled_dealflow_has_process_stages():
+ exp = load_experience("dealflow")
+ assert len(exp.process_stages) > 0
+ stage_names = [s.name for s in exp.process_stages]
+ assert "lead" in stage_names
+ assert "paid" in stage_names
+
+
+def test_bundled_sales_has_reward_memory_types():
+ exp = load_experience("sales")
+ assert "decision" in exp.reward_memory_types
+ assert "outcome" in exp.reward_memory_types
+
+
+# --- Integration: ingest_session passes experience weights ---
+
+def test_ingest_session_uses_experience_weights(tmp_path, monkeypatch):
+ """Verify ingest_session passes experience weights to compute_session_reward."""
+ from unittest.mock import patch, MagicMock
+
+ # Mock the ingest sub-functions
+ with patch("openexp.ingest.observation.ingest_observations") as mock_obs, \
+ patch("openexp.ingest.session_summary.ingest_sessions") as mock_sess, \
+ patch("openexp.ingest.reward.compute_session_reward") as mock_reward, \
+ patch("openexp.core.experience.get_active_experience") as mock_exp:
+
+ # Set up mocks
+ mock_obs.return_value = {"ingested": 0, "_point_ids": [], "_raw_observations": [
+ {"summary": "email sent to client", "tool": "Bash", "session_id": "sess-123"},
+ ]}
+ mock_sess.return_value = {"ingested": 0}
+ mock_reward.return_value = 0.0 # neutral, so no further calls needed
+
+ sales_exp = Experience(
+ name="sales",
+ description="test",
+ session_reward_weights={"email_sent": 0.15, "base": -0.05},
+ )
+ mock_exp.return_value = sales_exp
+
+ from openexp.ingest import ingest_session
+ ingest_session(session_id="sess-123")
+
+ # Verify compute_session_reward was called with experience weights
+ mock_reward.assert_called_once()
+ call_kwargs = mock_reward.call_args
+ # weights= should be the experience weights, not None/defaults
+ assert call_kwargs[1]["weights"] == {"email_sent": 0.15, "base": -0.05}
diff --git a/tests/test_session_end.py b/tests/test_session_end.py
index b6a8d71..746f55f 100644
--- a/tests/test_session_end.py
+++ b/tests/test_session_end.py
@@ -142,3 +142,77 @@ def test_no_retrievals_no_update(self, tmp_path):
updated = reward_retrieved_memories("sess-nope", reward=0.3)
assert updated == 0
+
+
+class TestMemoryTypeFiltering:
+ def test_reward_memory_types_filters(self, tmp_path):
+ """reward_memory_types filters which memories get rewarded."""
+ ret_path = tmp_path / "ret.jsonl"
+ q_cache_path = tmp_path / "q_cache.json"
+
+ # Write retrieval log with 3 memories
+ record = {
+ "session_id": "sess-filter",
+ "timestamp": datetime.now(timezone.utc).isoformat(),
+ "query": "test",
+ "memory_ids": ["mem-decision", "mem-action", "mem-fact"],
+ "scores": [0.9, 0.8, 0.7],
+ }
+ ret_path.write_text(json.dumps(record) + "\n")
+
+ # Mock Qdrant client to return memory types
+ mock_point_decision = MagicMock()
+ mock_point_decision.id = "mem-decision"
+ mock_point_decision.payload = {"memory_type": "decision"}
+
+ mock_point_action = MagicMock()
+ mock_point_action.id = "mem-action"
+ mock_point_action.payload = {"memory_type": "action"}
+
+ mock_point_fact = MagicMock()
+ mock_point_fact.id = "mem-fact"
+ mock_point_fact.payload = {"memory_type": "fact"}
+
+ mock_client = MagicMock()
+ mock_client.retrieve.return_value = [mock_point_decision, mock_point_action, mock_point_fact]
+
+ with patch("openexp.ingest.retrieval_log.RETRIEVALS_PATH", ret_path), \
+ patch("openexp.ingest.reward.Q_CACHE_PATH", q_cache_path), \
+ patch("openexp.core.direct_search._get_qdrant", return_value=mock_client):
+ # Only reward decisions — should filter out action and fact
+ updated = reward_retrieved_memories(
+ "sess-filter", reward=0.3,
+ reward_memory_types=["decision"],
+ )
+
+ # Only 1 memory should be rewarded (the decision)
+ assert updated == 1
+
+ def test_empty_reward_memory_types_rewards_all(self, tmp_path):
+ """Empty reward_memory_types list rewards all memories (default behavior)."""
+ ret_path = tmp_path / "ret.jsonl"
+ q_cache_path = tmp_path / "q_cache.json"
+
+ record = {
+ "session_id": "sess-all",
+ "timestamp": datetime.now(timezone.utc).isoformat(),
+ "query": "test",
+ "memory_ids": ["mem-a", "mem-b"],
+ "scores": [0.9, 0.8],
+ }
+ ret_path.write_text(json.dumps(record) + "\n")
+
+ q_cache_path.write_text(json.dumps({
+ "mem-a": {"q_value": 0.0, "q_action": 0.0, "q_hypothesis": 0.0, "q_fit": 0.0, "q_visits": 0},
+ "mem-b": {"q_value": 0.0, "q_action": 0.0, "q_hypothesis": 0.0, "q_fit": 0.0, "q_visits": 0},
+ }))
+
+ with patch("openexp.ingest.retrieval_log.RETRIEVALS_PATH", ret_path), \
+ patch("openexp.ingest.reward.Q_CACHE_PATH", q_cache_path):
+ # Empty list = reward all (no filtering)
+ updated = reward_retrieved_memories(
+ "sess-all", reward=0.3,
+ reward_memory_types=[],
+ )
+
+ assert updated == 2
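
The two tests above pin down the contract: `compute_session_reward` receives the active experience's `session_reward_weights`, and `reward_retrieved_memories` honors `reward_memory_types`. The internals of the reward computation are not shown in this series, so the sketch below is one plausible reading, using the exact weights from the test: each matched signal adds its weight on top of a per-session `base` term.

```python
# Hedged sketch (assumption: signal weights are summed; the real
# compute_session_reward may differ). Weights come from the test above.
def sketch_session_reward(observations, weights):
    reward = weights.get("base", 0.0)
    for obs in observations:
        if "email sent" in obs.get("summary", "").lower():
            reward += weights.get("email_sent", 0.0)
    return reward

obs = [{"summary": "email sent to client", "tool": "Bash", "session_id": "sess-123"}]
print(round(sketch_session_reward(obs, {"email_sent": 0.15, "base": -0.05}), 2))  # 0.1
```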
From e0285955610536f049cbbf5850dfdfd493b01113 Mon Sep 17 00:00:00 2001
From: John
Date: Mon, 30 Mar 2026 01:56:53 -0700
Subject: [PATCH 31/59] =?UTF-8?q?docs:=20update=20positioning=20=E2=80=94?=
=?UTF-8?q?=20skills=20say=20how,=20OpenExp=20teaches=20what=20works=20(#1?=
=?UTF-8?q?8)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Reframe from "self-labeling experience engine" to outcome-based learning.
Core message: skills/CLAUDE.md are static instructions that don't learn
from results. OpenExp adds the feedback loop.
Co-authored-by: Ivan Pasichnyk
Co-authored-by: Claude Opus 4.6
---
README.md | 22 ++++++++++++++--------
1 file changed, 14 insertions(+), 8 deletions(-)
diff --git a/README.md b/README.md
index 975ae02..40c399e 100644
--- a/README.md
+++ b/README.md
@@ -1,8 +1,8 @@
OpenExp
- Self-labeling experience engine for AI agents
- Define your process. Outcomes label your data. AI learns what works.
+ Skills tell your AI how. OpenExp teaches it what works.
+ Outcome-based learning for AI agents. Q-learning memory that gets smarter with every session.
@@ -25,11 +25,15 @@
---
-Memory tools store and retrieve. OpenExp **learns which memories actually help you get work done** — and surfaces those first next time.
+You wrote a skill: "how to work with CRM." Your agent follows it perfectly. But it doesn't know that approach A closed deals and approach B didn't. Tomorrow it'll do the same thing as yesterday — even if yesterday didn't work.
-You define your process (software dev, sales, support, content). Every outcome — commit, closed deal, resolved ticket — feeds back as a reward signal. Over time, proven memories rank higher. Noise sinks.
+**Skills say *how*. OpenExp teaches *what works*.**
-### How it works for a sales team
+Every outcome — commit, closed deal, resolved ticket — feeds back as a reward signal. Memories that led to results get higher Q-values and surface first next time. Noise sinks.
+
+### Example: sales agent
+
+Your agent sent 200 emails this month. Which formulations got replies? Which approaches closed deals? Skills don't know — there's no feedback loop.
```yaml
# .openexp.yaml in your sales project
@@ -43,11 +47,13 @@ experience: sales
4. Next similar deal → the insights that led to the close surface first
```
-The same idea behind AlphaGo, applied to your AI agent's working memory.
+After a month, your agent "knows" not just how to write emails — but which emails lead to results.
## The Problem
-AI agents forget everything between sessions. Existing memory tools (Mem0, Zep, LangMem) just store and retrieve — every memory is equally important. A two-month-old note about a deleted feature has the same weight as yesterday's critical architecture decision.
+Skills and CLAUDE.md solve the "agent doesn't remember" problem. But they're **static instructions** — written once, never learning from outcomes. Your agent follows the playbook perfectly, but doesn't know which plays actually work.
+
+Existing memory tools (Mem0, Zep, LangMem) add storage — but every memory is equally important. A two-month-old note about a deleted feature has the same weight as yesterday's critical architecture decision.
**The missing piece:** there's no learning. No feedback loop from outcomes to retrieval quality.
@@ -97,7 +103,7 @@ After a few sessions, OpenExp learns what context actually helps you get work do
| **Hybrid retrieval** | BM25 + vector + recency + importance + Q-value (5 signals) | Vector only | Graph + vector | Vector only |
| **Privacy** | All data stays on your machine | Data sent to cloud | Depends on setup | Data sent to cloud |
-**The key difference:** other memory tools store and retrieve. OpenExp **learns which memories actually help you get work done** — and surfaces those first next time.
+**The key difference:** skills say how. Memory tools store. OpenExp **learns what works** — from real outcomes.
## Quick Start
From becea33f3985eec966003dc20f7f4bb3183e7c38 Mon Sep 17 00:00:00 2001
From: John
Date: Sun, 5 Apr 2026 21:16:56 -0700
Subject: [PATCH 32/59] Fix per-experience Q-value routing in observation
ingest (#19)
ingest_observations() was initializing Q-cache entries under "default"
experience regardless of active experience. Now accepts and passes
experience parameter to q_cache.set(), enabling proper per-experience
Q-values when dealflow/sales experiences are active.
Also updates storage-system.md to reflect the fix and the new 250-test count.
Co-authored-by: Ivan Pasichnyk
Co-authored-by: Claude Opus 4.6
---
docs/storage-system.md | 8 ++++----
openexp/ingest/__init__.py | 2 +-
openexp/ingest/observation.py | 3 ++-
3 files changed, 7 insertions(+), 6 deletions(-)
diff --git a/docs/storage-system.md b/docs/storage-system.md
index 501cd83..4bcb3fb 100644
--- a/docs/storage-system.md
+++ b/docs/storage-system.md
@@ -3,7 +3,7 @@
> **Purpose:** This document describes the full storage architecture so that Claude
> doesn't have to re-read every source file each session. Read THIS instead of the code.
>
-> **Last updated:** 2026-03-26 (after L4 audit, all gaps fixed, 237 tests pass)
+> **Last updated:** 2026-04-05 (experience routing fix, 250 tests pass)
---
@@ -314,7 +314,7 @@ Same memory can have different Q-values per experience (e.g., "default", "sales"
↓
filters.py (drops ~60-70% trivial obs)
↓
- observation.py (batch embed via FastEmbed → upsert to Qdrant)
+ observation.py (batch embed via FastEmbed → upsert to Qdrant, experience-aware Q init)
↓
~/.openexp/sessions/*.md (written by session-end hook)
↓
@@ -372,7 +372,7 @@ Keeps: Write, Edit, Bash with side effects, decisions, valuable tags.
| File | Purpose |
|------|---------|
| `ingest/filters.py` | Drop trivial observations |
-| `ingest/observation.py` | Batch embed → Qdrant upsert |
+| `ingest/observation.py` | Batch embed → Qdrant upsert (passes `experience` to Q-cache init) |
| `ingest/session_summary.py` | Parse session markdown → memories |
| `ingest/reward.py` | Session reward computation + Q-update + L3/L4 |
| `ingest/retrieval_log.py` | Track recalled memory IDs |
@@ -442,7 +442,7 @@ Keeps: Write, Edit, Bash with side effects, decisions, valuable tags.
## 14. Test Coverage
-237 tests across 11 test files. Key test files for the storage system:
+250 tests across 11 test files. Key test files for the storage system:
| File | Tests | What |
|------|-------|------|
diff --git a/openexp/ingest/__init__.py b/openexp/ingest/__init__.py
index 8a8fe01..2a71b79 100644
--- a/openexp/ingest/__init__.py
+++ b/openexp/ingest/__init__.py
@@ -61,7 +61,7 @@ def ingest_session(
result = {}
if not sessions_only:
- obs_result = ingest_observations(max_count=max_count, dry_run=dry_run)
+ obs_result = ingest_observations(max_count=max_count, dry_run=dry_run, experience=experience.name)
result["observations"] = obs_result
else:
result["observations"] = {"skipped": True}
diff --git a/openexp/ingest/observation.py b/openexp/ingest/observation.py
index ead3822..26c32bb 100644
--- a/openexp/ingest/observation.py
+++ b/openexp/ingest/observation.py
@@ -196,6 +196,7 @@ def ingest_observations(
max_count: int = 0,
dry_run: bool = False,
obs_dir: Optional[Path] = None,
+ experience: str = "default",
) -> Dict:
"""Ingest observations into Qdrant."""
obs_dir = obs_dir or OBSERVATIONS_DIR
@@ -279,7 +280,7 @@ def ingest_observations(
"q_hypothesis": q_init,
"q_fit": q_init,
"q_visits": 0,
- })
+ }, experience=experience)
ingested_point_ids.append(point_id)
watermark.mark_obs_processed(obs.get("id", ""))
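
For readers following the data model: storage-system.md above notes that the same memory can hold different Q-values per experience. A minimal sketch of the nested layout this patch routes writes into, with field names taken from the tests earlier in the series (the memory ID and the fallback rule are illustrative assumptions):

```python
# One q_data dict per experience under each memory ID.
q_cache = {
    "mem-a1b2": {  # hypothetical memory ID
        "default": {"q_value": 0.12, "q_action": 0.12, "q_hypothesis": 0.5, "q_fit": 0.5, "q_visits": 3},
        "sales":   {"q_value": 0.40, "q_action": 0.40, "q_hypothesis": 0.5, "q_fit": 0.5, "q_visits": 7},
    }
}

def q_for(mem_id, experience="default"):
    # Assumed fallback: use "default" when the experience has no entry yet.
    per_exp = q_cache.get(mem_id, {})
    return per_exp.get(experience, per_exp.get("default", {})).get("q_value", 0.0)

print(q_for("mem-a1b2", "sales"))  # 0.4
```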
From d65417acf84a47c94eae5f245a533bab00b91ae8 Mon Sep 17 00:00:00 2001
From: John
Date: Sun, 5 Apr 2026 22:29:22 -0700
Subject: [PATCH 33/59] Fix Q-value wiring, add cache locking, fix test
isolation (#20)
Three bugs found during architecture audit:
1. Q-value from q_cache never reached hybrid_search scoring formula.
direct_search set result["q_value"] but hybrid_search only checked
payload/metadata/q_estimate. Added result.get("q_value") as first
priority in the lookup chain.
2. QCache.save() had no file locking. Concurrent session-end hooks
caused lost updates (15K entries wiped to 1). Added fcntl.flock
with merge-on-save to prevent data loss. Extracted _write_to_disk()
to avoid deadlock with load_and_merge().
3. test_session_end.py and test_outcome.py didn't patch REWARD_LOG_PATH,
polluting real reward_log.jsonl with 100+ test entries. Added autouse
fixture to isolate reward_log in both test files.
Co-authored-by: Ivan Pasichnyk
Co-authored-by: Claude Opus 4.6
---
openexp/core/hybrid_search.py | 5 ++++-
openexp/core/q_value.py | 28 +++++++++++++++++++++++++---
tests/test_outcome.py | 8 ++++++++
tests/test_session_end.py | 8 ++++++++
4 files changed, 45 insertions(+), 4 deletions(-)
diff --git a/openexp/core/hybrid_search.py b/openexp/core/hybrid_search.py
index e6ed32b..056f43d 100644
--- a/openexp/core/hybrid_search.py
+++ b/openexp/core/hybrid_search.py
@@ -165,8 +165,11 @@ def hybrid_search(
status_multiplier = STATUS_WEIGHTS.get(status, 1.0)
# Explicit None checks — 0.0 is a valid Q-value (downranked memory)
+ # Priority: top-level result (set by direct_search from q_cache) > payload > metadata > q_estimate > default
from .q_value import DEFAULT_Q_CONFIG
- q_value = payload.get("q_value")
+ q_value = result.get("q_value")
+ if q_value is None:
+ q_value = payload.get("q_value")
if q_value is None:
q_value = metadata.get("q_value")
if q_value is None:
diff --git a/openexp/core/q_value.py b/openexp/core/q_value.py
index 0a80fa1..373aad2 100644
--- a/openexp/core/q_value.py
+++ b/openexp/core/q_value.py
@@ -183,12 +183,34 @@ def get_experience_stats(self, experience: str = "default") -> Dict[str, Any]:
def __len__(self):
return len(self._cache)
- def save(self, path: Path):
+ def _write_to_disk(self, path: Path):
+ """Write cache to file (no locking — caller must hold lock if needed)."""
data = {k: v for k, v in self._cache.items()}
tmp_path = path.with_suffix(".tmp")
tmp_path.write_text(json.dumps(data, ensure_ascii=False))
tmp_path.rename(path)
+ def save(self, path: Path):
+ """Save cache to file with exclusive file locking to prevent concurrent overwrites."""
+ lock_path = path.with_suffix(".lock")
+ lock_path.parent.mkdir(parents=True, exist_ok=True)
+ lock_fd = open(lock_path, "w")
+ try:
+ fcntl.flock(lock_fd, fcntl.LOCK_EX)
+ # Re-read file under lock to merge any changes written by other processes
+ if path.exists():
+ try:
+ disk_data = json.loads(path.read_text())
+ for mem_id, exp_dict in disk_data.items():
+ if mem_id not in self._cache:
+ self._cache[mem_id] = exp_dict
+ except (json.JSONDecodeError, OSError):
+ pass # Corrupt file — our in-memory data takes precedence
+ self._write_to_disk(path)
+ finally:
+ fcntl.flock(lock_fd, fcntl.LOCK_UN)
+ lock_fd.close()
+
def load(self, path: Path):
if path.exists():
try:
@@ -264,10 +286,10 @@ def load_and_merge(self, path: Path, deltas_dir: Path):
except (json.JSONDecodeError, OSError) as e:
logger.warning("Failed to merge delta %s: %s", delta_file, e)
if merged_any:
- self.save(path)
+ self._write_to_disk(path)
if self._migrated:
if not merged_any:
- self.save(path)
+ self._write_to_disk(path)
self._migrated = False
finally:
fcntl.flock(lock_fd, fcntl.LOCK_UN)
diff --git a/tests/test_outcome.py b/tests/test_outcome.py
index 8b5e04b..aece439 100644
--- a/tests/test_outcome.py
+++ b/tests/test_outcome.py
@@ -27,6 +27,14 @@ def cleanup_test_memories():
yield
+@pytest.fixture(autouse=True)
+def _isolate_reward_log(tmp_path):
+ """Prevent tests from polluting the real reward_log.jsonl."""
+ log_path = tmp_path / "reward_log.jsonl"
+ with patch("openexp.core.reward_log.REWARD_LOG_PATH", log_path):
+ yield
+
+
class TestOutcomeEvent:
def test_basic_construction(self):
event = OutcomeEvent(
diff --git a/tests/test_session_end.py b/tests/test_session_end.py
index 746f55f..2789101 100644
--- a/tests/test_session_end.py
+++ b/tests/test_session_end.py
@@ -15,6 +15,14 @@
from openexp.ingest.retrieval_log import log_retrieval, get_session_retrievals
+@pytest.fixture(autouse=True)
+def _isolate_reward_log(tmp_path):
+ """Prevent tests from polluting the real reward_log.jsonl."""
+ log_path = tmp_path / "reward_log.jsonl"
+ with patch("openexp.core.reward_log.REWARD_LOG_PATH", log_path):
+ yield
+
+
# Override autouse async fixture from conftest.py
@pytest.fixture(autouse=True)
def cleanup_test_memories():
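
The lock-merge-write discipline from bug 2 generalizes beyond the Q-cache. A self-contained sketch of the same pattern (POSIX-only, since it uses fcntl; file names are illustrative), with the merge policy from the patch: in-memory entries win, disk-only entries are preserved.

```python
import fcntl
import json
from pathlib import Path

def locked_save(path: Path, in_memory: dict) -> None:
    lock_path = path.with_suffix(".lock")
    lock_path.parent.mkdir(parents=True, exist_ok=True)
    with open(lock_path, "w") as lock_fd:
        fcntl.flock(lock_fd, fcntl.LOCK_EX)
        try:
            # Re-read under the lock to merge concurrent writers' entries.
            if path.exists():
                try:
                    on_disk = json.loads(path.read_text())
                except (json.JSONDecodeError, OSError):
                    on_disk = {}  # corrupt file: in-memory data takes precedence
                for key, value in on_disk.items():
                    in_memory.setdefault(key, value)
            tmp = path.with_suffix(".tmp")
            tmp.write_text(json.dumps(in_memory, ensure_ascii=False))
            tmp.rename(path)  # atomic replace on POSIX
        finally:
            fcntl.flock(lock_fd, fcntl.LOCK_UN)
```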
From 3f038638070d9156ffcc1f9ed647f791a6b55d4b Mon Sep 17 00:00:00 2001
From: John
Date: Sun, 5 Apr 2026 23:50:30 -0700
Subject: [PATCH 34/59] feat: auto-detect experience from prompt keywords (#21)
Adds keyword-based experience classifier to the UserPromptSubmit hook.
When a user writes about clients/deals/proposals, the system automatically
switches to the sales experience for retrieval. Invoice/payment/NDA prompts
activate dealflow. Coding prompts stay on default.
Changes:
- Add detect_keywords field to Experience dataclass and YAML configs
- Add detect_experience_from_prompt() with threshold=2 keyword matches
- Add session experience persistence (save/get/cleanup) for session-end
- Modify user-prompt-recall.sh to detect and pass experience to search
- Modify session-end.sh to read auto-detected experience
- Add 26 EN+UK keywords for sales, 16 for dealflow
- 13 new tests (47 total in test_experience.py, 263 total)
Co-authored-by: Ivan Pasichnyk
Co-authored-by: Claude Opus 4.6
---
openexp/core/experience.py | 63 ++++++++++++++++++
openexp/data/experiences/dealflow.yaml | 19 ++++++
openexp/data/experiences/sales.yaml | 29 +++++++++
openexp/hooks/session-end.sh | 90 +++++++++++++++++++++++++-
openexp/hooks/user-prompt-recall.sh | 26 ++++++--
tests/test_experience.py | 76 ++++++++++++++++++++++
6 files changed, 296 insertions(+), 7 deletions(-)
diff --git a/openexp/core/experience.py b/openexp/core/experience.py
index da73aaa..aa0548c 100644
--- a/openexp/core/experience.py
+++ b/openexp/core/experience.py
@@ -45,6 +45,7 @@ class Experience:
q_config_overrides: Dict[str, float] = field(default_factory=dict)
process_stages: List[ProcessStage] = field(default_factory=list)
reward_memory_types: List[str] = field(default_factory=list)
+ detect_keywords: List[str] = field(default_factory=list)
DEFAULT_EXPERIENCE = Experience(
@@ -108,6 +109,7 @@ def _parse_yaml(path: Path) -> Experience:
q_config_overrides=data.get("q_config_overrides", {}),
process_stages=process_stages,
reward_memory_types=data.get("reward_memory_types", []),
+ detect_keywords=data.get("detect_keywords", []),
)
@@ -205,3 +207,64 @@ def list_experiences() -> List[Experience]:
experiences.insert(0, DEFAULT_EXPERIENCE)
return experiences
+
+
+# --- Experience auto-detection from prompt text ---
+
+# Minimum keyword matches required to switch from default
+_DETECT_THRESHOLD = 2
+
+
+def detect_experience_from_prompt(prompt: str) -> str:
+ """Detect the best-matching experience from a user prompt using keyword scoring.
+
+ Returns the experience name with the most keyword hits (minimum 2),
+ or "default" if no experience reaches the threshold.
+ """
+ if not prompt or len(prompt) < 10:
+ return "default"
+
+ prompt_lower = prompt.lower()
+ experiences = list_experiences()
+
+ best_name = "default"
+ best_score = 0
+
+ for exp in experiences:
+ if not exp.detect_keywords or exp.name == "default":
+ continue
+ score = sum(1 for kw in exp.detect_keywords if kw in prompt_lower)
+ if score > best_score and score >= _DETECT_THRESHOLD:
+ best_score = score
+ best_name = exp.name
+
+ if best_name != "default":
+ logger.debug("Auto-detected experience '%s' (score=%d) from prompt", best_name, best_score)
+
+ return best_name
+
+
+def save_session_experience(session_id: str, experience_name: str) -> None:
+ """Persist detected experience for a session (for session-end to read)."""
+ from .config import DATA_DIR
+ exp_file = DATA_DIR / f"session_{session_id}_experience.txt"
+ exp_file.parent.mkdir(parents=True, exist_ok=True)
+ exp_file.write_text(experience_name)
+
+
+def get_session_experience(session_id: str) -> Optional[str]:
+ """Read the detected experience for a session, if saved."""
+ from .config import DATA_DIR
+ exp_file = DATA_DIR / f"session_{session_id}_experience.txt"
+ if exp_file.exists():
+ name = exp_file.read_text().strip()
+ if _validate_experience_name(name):
+ return name
+ return None
+
+
+def cleanup_session_experience(session_id: str) -> None:
+ """Remove the session experience file after session-end processing."""
+ from .config import DATA_DIR
+ exp_file = DATA_DIR / f"session_{session_id}_experience.txt"
+ exp_file.unlink(missing_ok=True)
diff --git a/openexp/data/experiences/dealflow.yaml b/openexp/data/experiences/dealflow.yaml
index b9bea7b..ebac3f3 100644
--- a/openexp/data/experiences/dealflow.yaml
+++ b/openexp/data/experiences/dealflow.yaml
@@ -58,3 +58,22 @@ reward_memory_types:
- decision
- insight
- outcome
+
+# Keywords for auto-detection from prompt text (EN + UK)
+detect_keywords:
+ - invoice
+ - payment
+ - nda
+ - pricing
+ - negotiation
+ - sow
+ - billing
+ - paid
+ - quote
+ - інвойс
+ - оплат
+ - рахунок
+ - ціна
+ - переговор
+ - акт
+ - нда
diff --git a/openexp/data/experiences/sales.yaml b/openexp/data/experiences/sales.yaml
index 31bc6ea..4857f11 100644
--- a/openexp/data/experiences/sales.yaml
+++ b/openexp/data/experiences/sales.yaml
@@ -43,3 +43,32 @@ reward_memory_types:
- decision
- insight
- outcome
+
+# Keywords for auto-detection from prompt text (EN + UK)
+detect_keywords:
+ - client
+ - deal
+ - lead
+ - proposal
+ - outreach
+ - follow-up
+ - follow up
+ - email
+ - crm
+ - pipeline
+ - sales
+ - prospect
+ - revenue
+ - close
+ - contract
+ - клієнт
+ - угода
+ - лід
+ - пропозиц
+ - аутріч
+ - фоловап
+ - імейл
+ - продаж
+ - контракт
+ - листа
+ - написати лист
diff --git a/openexp/hooks/session-end.sh b/openexp/hooks/session-end.sh
index 849a978..1771aa6 100755
--- a/openexp/hooks/session-end.sh
+++ b/openexp/hooks/session-end.sh
@@ -135,9 +135,20 @@ fi
cd "$OPENEXP_DIR"
echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] SessionEnd: starting ingest for session $SESSION_SHORT" >> "$INGEST_LOG"
- # Resolve experience: project .openexp.yaml → env var → default
+ # Resolve experience: auto-detected (from prompts) → project .openexp.yaml → env var → default
EXPERIENCE="${OPENEXP_EXPERIENCE:-default}"
- if [ -n "$CWD" ] && [ -f "$CWD/.openexp.yaml" ]; then
+ # Check if experience was auto-detected during this session
+ AUTO_EXP=$("$PYTHON" -c "
+import sys
+sys.path.insert(0, '.')
+from openexp.core.experience import get_session_experience
+exp = get_session_experience('$SESSION_ID')
+print(exp or '')
+" 2>/dev/null)
+ if [ -n "$AUTO_EXP" ]; then
+ EXPERIENCE="$AUTO_EXP"
+ echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] SessionEnd: using auto-detected experience '$EXPERIENCE'" >> "$INGEST_LOG"
+ elif [ -n "$CWD" ] && [ -f "$CWD/.openexp.yaml" ]; then
PROJECT_EXP=$(OPENEXP_CWD="$CWD" python3 -c "
import yaml, os
d=yaml.safe_load(open(os.path.join(os.environ['OPENEXP_CWD'], '.openexp.yaml')))
@@ -146,10 +157,83 @@ print(d.get('experience',''))
[ -n "$PROJECT_EXP" ] && EXPERIENCE="$PROJECT_EXP"
fi
export OPENEXP_EXPERIENCE="$EXPERIENCE"
+ # Phase 2a: Full ingest + session reward (ingests ALL pending obs, rewards THIS session)
"$PYTHON" -m openexp.cli ingest --session-id "$SESSION_ID" >> "$INGEST_LOG" 2>&1
EXIT_CODE=$?
-
echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] SessionEnd: ingest finished (exit=$EXIT_CODE)" >> "$INGEST_LOG"
+
+ # Phase 2b: Fallback reward — if obs were already ingested (by launchd or prior session),
+ # raw_obs was empty and reward didn't fire above. Read obs from JSONL directly.
+ # Guard: skip if reward was already applied for this session (idempotency).
+ "$PYTHON" -c "
+import json, sys, logging
+from pathlib import Path
+
+logging.basicConfig(level=logging.INFO)
+session_id = '$SESSION_ID'
+data_dir = Path.home() / '.openexp' / 'data'
+reward_log = data_dir / 'reward_log.jsonl'
+
+# Check if reward already applied for this session
+if reward_log.exists():
+ for line in reward_log.read_text().splitlines():
+ if not line.strip():
+ continue
+ try:
+ entry = json.loads(line)
+ except json.JSONDecodeError:
+ continue
+ ctx = entry.get('context', {})
+ if isinstance(ctx, dict) and session_id in ctx.get('session_id', ''):
+ print(f'Reward already applied for session {session_id[:8]}, skipping fallback')
+ sys.exit(0)
+
+# No reward yet — read observations from JSONL and compute
+from openexp.ingest.reward import compute_session_reward, reward_retrieved_memories, _build_session_reward_context
+from openexp.core.experience import get_active_experience
+
+obs_dir = Path.home() / '.openexp' / 'observations'
+session_obs = []
+for f in sorted(obs_dir.glob('observations-*.jsonl')):
+ for line in f.read_text().splitlines():
+ if not line.strip():
+ continue
+ try:
+ obs = json.loads(line)
+ except json.JSONDecodeError:
+ continue
+ sid = obs.get('session_id', '')
+ if session_id in sid or sid.startswith(session_id[:8]):
+ session_obs.append(obs)
+
+if not session_obs:
+ print(f'No observations found for session {session_id[:8]}')
+ sys.exit(0)
+
+experience = get_active_experience()
+reward = compute_session_reward(session_obs, weights=experience.session_reward_weights)
+if reward == 0.0:
+ print(f'Session {session_id[:8]}: neutral reward, skipping')
+ sys.exit(0)
+
+reward_ctx = _build_session_reward_context(session_obs, reward)
+updated = reward_retrieved_memories(
+ session_id, reward,
+ experience=experience.name,
+ reward_context=reward_ctx,
+ reward_memory_types=experience.reward_memory_types,
+)
+print(f'Fallback reward={reward:.2f} applied to {updated} retrieved memories ({len(session_obs)} obs)')
+" >> "$INGEST_LOG" 2>&1
+ echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] SessionEnd: fallback reward finished" >> "$INGEST_LOG"
+
+ # Cleanup session experience file
+ "$PYTHON" -c "
+import sys
+sys.path.insert(0, '.')
+from openexp.core.experience import cleanup_session_experience
+cleanup_session_experience('$SESSION_ID')
+" 2>/dev/null
) &
disown
diff --git a/openexp/hooks/user-prompt-recall.sh b/openexp/hooks/user-prompt-recall.sh
index 7cccf4d..aba4178 100755
--- a/openexp/hooks/user-prompt-recall.sh
+++ b/openexp/hooks/user-prompt-recall.sh
@@ -38,15 +38,17 @@ esac
# Truncate prompt for search query (max 300 chars)
QUERY="${PROMPT:0:300}"
-# --- Search memories ---
+# --- Detect experience from prompt + search memories ---
cd "$OPENEXP_DIR"
export OPENEXP_TMPFILE="$TMPFILE"
+export OPENEXP_SESSION_ID="$SESSION_ID"
"$PYTHON" -c "
import json, sys, os
sys.path.insert(0, '.')
from openexp.core.config import Q_CACHE_PATH
from openexp.core.q_value import QCache
from openexp.core import direct_search
+from openexp.core.experience import detect_experience_from_prompt, save_session_experience
q = QCache()
q.load(Q_CACHE_PATH)
@@ -55,9 +57,15 @@ query = sys.stdin.read().strip()
if not query:
sys.exit(1)
+# Auto-detect experience from prompt keywords
+experience = detect_experience_from_prompt(query)
+session_id = os.environ.get('OPENEXP_SESSION_ID', '')
+if experience != 'default' and session_id and session_id != 'unknown':
+ save_session_experience(session_id, experience)
+
tmpfile = os.environ['OPENEXP_TMPFILE']
-context = direct_search.search_memories(query=query, limit=5, q_cache=q)
-json.dump({'context': context}, open(tmpfile, 'w'), default=str)
+context = direct_search.search_memories(query=query, limit=5, q_cache=q, experience=experience)
+json.dump({'context': context, 'experience': experience}, open(tmpfile, 'w'), default=str)
" <<< "$QUERY" 2>/dev/null
if [ ! -s "$TMPFILE" ]; then
@@ -90,15 +98,25 @@ if [ -n "$ALL_IDS" ] && [ "$SESSION_ID" != "unknown" ]; then
--memory-ids "$ALL_IDS" --scores "$ALL_SCORES" 2>/dev/null) &
fi
+# --- Read detected experience ---
+DETECTED_EXP=$(jq -r '.experience // "default"' "$TMPFILE" 2>/dev/null)
+
# --- Build output using jq for safe string handling ---
REMINDER="\n\nREMINDER: Before starting this task, call search_memory with a targeted query. Hooks recalled the above automatically, but you must also do a manual targeted search for complex tasks."
+# Show experience label if non-default
+EXP_LABEL=""
+if [ "$DETECTED_EXP" != "default" ]; then
+ EXP_LABEL=" [experience: $DETECTED_EXP]"
+fi
+
jq -n \
--arg context "$CONTEXT_TEXT" \
--arg reminder "$REMINDER" \
+ --arg exp_label "$EXP_LABEL" \
'{
hookSpecificOutput: {
hookEventName: "UserPromptSubmit",
- additionalContext: ("## Recall: Context\n" + $context + $reminder + "\n")
+ additionalContext: ("## Recall: Context" + $exp_label + "\n" + $context + $reminder + "\n")
}
}'
diff --git a/tests/test_experience.py b/tests/test_experience.py
index cfba5bc..267ddcb 100644
--- a/tests/test_experience.py
+++ b/tests/test_experience.py
@@ -15,6 +15,10 @@
list_experiences,
_parse_yaml,
_parse_process_stages,
+ detect_experience_from_prompt,
+ save_session_experience,
+ get_session_experience,
+ cleanup_session_experience,
)
from openexp.core.q_value import (
QCache,
@@ -480,3 +484,75 @@ def test_ingest_session_uses_experience_weights(tmp_path, monkeypatch):
call_kwargs = mock_reward.call_args
# weights= should be the experience weights, not None/defaults
assert call_kwargs[1]["weights"] == {"email_sent": 0.15, "base": -0.05}
+
+
+# --- Experience auto-detection ---
+
+class TestDetectExperience:
+ def test_sales_keywords_english(self):
+ prompt = "write an email to the client about our proposal"
+ assert detect_experience_from_prompt(prompt) == "sales"
+
+ def test_sales_keywords_ukrainian(self):
+ prompt = "напиши листа клієнту про нашу пропозицію"
+ assert detect_experience_from_prompt(prompt) == "sales"
+
+ def test_dealflow_keywords(self):
+ prompt = "check if the invoice was paid and update pricing"
+ assert detect_experience_from_prompt(prompt) == "dealflow"
+
+ def test_dealflow_keywords_ukrainian(self):
+ prompt = "перевір чи прийшла оплата за рахунок"
+ assert detect_experience_from_prompt(prompt) == "dealflow"
+
+ def test_coding_stays_default(self):
+ prompt = "fix the bug in auth.py where the token refresh fails"
+ assert detect_experience_from_prompt(prompt) == "default"
+
+ def test_short_prompt_default(self):
+ assert detect_experience_from_prompt("ok") == "default"
+
+ def test_empty_prompt_default(self):
+ assert detect_experience_from_prompt("") == "default"
+
+ def test_single_keyword_not_enough(self):
+ """One keyword match is below threshold (needs 2+)."""
+ prompt = "tell me about the client relationship"
+ # "client" matches sales, but only 1 match — below threshold
+ result = detect_experience_from_prompt(prompt)
+ # Could be sales if "client" + something else matches, or default
+ # The point is: threshold=2 requires at least 2 keyword hits
+ assert result in ("default", "sales")
+
+ def test_ambiguous_prefers_higher_score(self):
+ """When multiple experiences match, highest score wins."""
+ prompt = "send invoice to client for the deal and check payment status"
+ # "client" + "deal" → sales (2 hits)
+ # "invoice" + "payment" → dealflow (2 hits)
+ # Both >= threshold, whichever scores higher wins
+ result = detect_experience_from_prompt(prompt)
+ assert result in ("sales", "dealflow")
+
+
+class TestSessionExperience:
+ def test_save_and_get(self, tmp_path, monkeypatch):
+ monkeypatch.setattr("openexp.core.config.DATA_DIR", tmp_path)
+ save_session_experience("sess-abc", "sales")
+ assert get_session_experience("sess-abc") == "sales"
+
+ def test_get_nonexistent(self, tmp_path, monkeypatch):
+ monkeypatch.setattr("openexp.core.config.DATA_DIR", tmp_path)
+ assert get_session_experience("sess-nope") is None
+
+ def test_cleanup(self, tmp_path, monkeypatch):
+ monkeypatch.setattr("openexp.core.config.DATA_DIR", tmp_path)
+ save_session_experience("sess-abc", "dealflow")
+ assert get_session_experience("sess-abc") == "dealflow"
+ cleanup_session_experience("sess-abc")
+ assert get_session_experience("sess-abc") is None
+
+ def test_invalid_name_rejected(self, tmp_path, monkeypatch):
+ monkeypatch.setattr("openexp.core.config.DATA_DIR", tmp_path)
+ exp_file = tmp_path / "session_sess-bad_experience.txt"
+ exp_file.write_text("../../../etc/passwd") # path traversal attempt
+ assert get_session_experience("sess-bad") is None
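
To see the detector end to end without the YAML loader, here is a standalone replay of the threshold-2 scorer using a subset of the keywords from the diffs above; the logic mirrors detect_experience_from_prompt().

```python
DETECT_THRESHOLD = 2
DETECT_KEYWORDS = {
    "sales": ["client", "deal", "proposal", "email", "crm", "pipeline"],
    "dealflow": ["invoice", "payment", "pricing", "billing", "quote"],
}

def detect(prompt: str) -> str:
    if not prompt or len(prompt) < 10:
        return "default"
    text = prompt.lower()
    best_name, best_score = "default", 0
    for name, keywords in DETECT_KEYWORDS.items():
        score = sum(1 for kw in keywords if kw in text)
        if score > best_score and score >= DETECT_THRESHOLD:
            best_name, best_score = name, score
    return best_name

print(detect("write an email to the client about our proposal"))  # sales
print(detect("fix the bug in auth.py"))                           # default
```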
From 631368fe4b6071ade0df82416e0a041edf78b710 Mon Sep 17 00:00:00 2001
From: Ivan Pasichnyk
Date: Mon, 6 Apr 2026 00:00:56 -0700
Subject: [PATCH 35/59] fix: JSONL format and multi-line parser for
observations
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Critical bug: post-tool-use.sh wrote pretty-printed JSON (multi-line)
instead of JSONL (one JSON per line). This caused:
1. _load_observations() couldn't parse any observations
2. Session reward never found observations → "no observations for this session"
3. Q-values stayed at 0.0 forever — the reward loop was broken
Fixes:
- Add -c flag to jq in post-tool-use.sh (compact output = true JSONL)
- Add multi-line JSON fallback parser in _load_observations() for existing files
- Reuse _load_observations() in ingest_session() fallback path
- Now correctly loads 19,983 observations from mixed-format files
Verified: full reward pipeline works end-to-end
Session obs → reward → retrieve IDs → Q-update → Q-value changes
Co-Authored-By: Claude Opus 4.6
---
openexp/hooks/post-tool-use.sh | 2 +-
openexp/ingest/__init__.py | 12 +++++++
openexp/ingest/observation.py | 62 ++++++++++++++++++++++++++++------
3 files changed, 65 insertions(+), 11 deletions(-)
diff --git a/openexp/hooks/post-tool-use.sh b/openexp/hooks/post-tool-use.sh
index 8aaab92..e1cd09b 100755
--- a/openexp/hooks/post-tool-use.sh
+++ b/openexp/hooks/post-tool-use.sh
@@ -62,7 +62,7 @@ TIMESTAMP=$(date -u +"%Y-%m-%dT%H:%M:%SZ")
# Write observation to JSONL
OBS_FILE="$OBS_DIR/observations-$(date +%Y-%m-%d).jsonl"
-jq -n \
+jq -cn \
--arg id "$OBS_ID" \
--arg timestamp "$TIMESTAMP" \
--arg session_id "$SESSION_ID" \
diff --git a/openexp/ingest/__init__.py b/openexp/ingest/__init__.py
index 2a71b79..ebd341a 100644
--- a/openexp/ingest/__init__.py
+++ b/openexp/ingest/__init__.py
@@ -84,6 +84,18 @@ def ingest_session(
else:
session_obs = raw_obs
+ # If raw_obs was empty (observations already ingested via watermark),
+ # read this session's observations directly from JSONL files.
+ if session_id and not session_obs:
+ from .observation import _load_observations, OBSERVATIONS_DIR
+ all_obs = _load_observations(OBSERVATIONS_DIR)
+ session_obs = [
+ o for o in all_obs
+ if session_id in o.get("session_id", "") or o.get("session_id", "").startswith(session_id[:8])
+ ]
+ if session_obs:
+ logger.info("Read %d observations for session %s from JSONL (already ingested)", len(session_obs), session_id[:8])
+
if session_id and session_obs:
# BUG FIX: pass experience weights instead of hardcoded defaults
reward = compute_session_reward(session_obs, weights=experience.session_reward_weights)
diff --git a/openexp/ingest/observation.py b/openexp/ingest/observation.py
index 26c32bb..a998cc7 100644
--- a/openexp/ingest/observation.py
+++ b/openexp/ingest/observation.py
@@ -163,8 +163,9 @@ def _detect_client_id(obs: Dict) -> Optional[str]:
def _load_observations(obs_dir: Path, processed_ids: set = None) -> List[Dict]:
"""Load all observations from JSONL files in directory.
- Streams line-by-line to avoid loading entire files into memory.
- Skips files larger than MAX_FILE_SIZE and already-processed IDs early.
+ Handles both true JSONL (one JSON per line) and multi-line pretty-printed
+ JSON objects (caused by jq without -c flag). Streams line-by-line for
+ JSONL, falls back to json.JSONDecoder for multi-line.
"""
all_obs = []
for f in sorted(obs_dir.glob("observations-*.jsonl")):
@@ -175,20 +176,61 @@ def _load_observations(obs_dir: Path, processed_ids: set = None) -> List[Dict]:
if file_size > MAX_FILE_SIZE:
logger.warning("Skipping oversized observation file %s (%d bytes > %d limit)", f, file_size, MAX_FILE_SIZE)
continue
- with open(f, encoding="utf-8") as fh:
- for line in fh:
+
+ content = f.read_text(encoding="utf-8")
+ file_obs = []
+
+ # Try JSONL first (fast path: first non-empty line is valid JSON)
+ first_line = ""
+ for line in content.split("\n"):
+ line = line.strip()
+ if line:
+ first_line = line
+ break
+
+ is_jsonl = False
+ if first_line:
+ try:
+ json.loads(first_line)
+ is_jsonl = True
+ except json.JSONDecodeError:
+ pass
+
+ if is_jsonl:
+ for line in content.split("\n"):
line = line.strip()
if not line:
continue
try:
obs = json.loads(line)
- except json.JSONDecodeError as e:
- logger.warning("Skipping malformed JSONL line in %s: %s", f, e)
- continue
- # Skip already-processed IDs early to save memory
- if processed_ids and obs.get("id", "") in processed_ids:
+ except json.JSONDecodeError:
continue
- all_obs.append(obs)
+ file_obs.append(obs)
+ else:
+ # Multi-line JSON: use decoder to extract consecutive objects
+ decoder = json.JSONDecoder()
+ idx = 0
+ while idx < len(content):
+ # Skip whitespace
+ while idx < len(content) and content[idx] in " \t\n\r":
+ idx += 1
+ if idx >= len(content):
+ break
+ try:
+ obj, end_idx = decoder.raw_decode(content, idx)
+ file_obs.append(obj)
+ idx = end_idx
+ except json.JSONDecodeError:
+ # Skip to next line
+ next_nl = content.find("\n", idx)
+ idx = next_nl + 1 if next_nl != -1 else len(content)
+
+ # Filter already-processed IDs
+ for obs in file_obs:
+ if processed_ids and obs.get("id", "") in processed_ids:
+ continue
+ all_obs.append(obs)
+
return all_obs
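
A quick demonstration of the raw_decode fallback on the kind of pretty-printed file that jq without -c produced (the sample records are made up):

```python
import json

content = """{
  "id": "obs-1",
  "tool": "Edit"
}
{
  "id": "obs-2",
  "tool": "Bash"
}
"""

decoder = json.JSONDecoder()
idx, objs = 0, []
while idx < len(content):
    # Skip whitespace between consecutive JSON objects.
    while idx < len(content) and content[idx] in " \t\n\r":
        idx += 1
    if idx >= len(content):
        break
    obj, idx = decoder.raw_decode(content, idx)
    objs.append(obj)

print([o["id"] for o in objs])  # ['obs-1', 'obs-2']
```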
From 6398e19d45e2bfcecfbd3f4f87d3035293b1bddb Mon Sep 17 00:00:00 2001
From: Ivan Pasichnyk
Date: Mon, 6 Apr 2026 00:03:07 -0700
Subject: [PATCH 36/59] docs: update architecture with auto-detect, research,
hippocampus analogy
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
- Add experience auto-detect to Prompt Recall component
- Add "The Problem" section with 3 research citations on context degradation
- Add hippocampus analogy (Encoding → Consolidation → Retrieval → Reinforcement)
- Add Q&A: why keyword detection over LLM classification
Co-Authored-By: Claude Opus 4.6
---
openexp-architecture.html | 521 ++++++++++++++++++++++++++++++++++++++
1 file changed, 521 insertions(+)
create mode 100644 openexp-architecture.html
diff --git a/openexp-architecture.html b/openexp-architecture.html
new file mode 100644
index 0000000..756576e
--- /dev/null
+++ b/openexp-architecture.html
@@ -0,0 +1,521 @@
+
+
+
+
+
+OpenExp — Architecture
+
+
+
+
+
+
+
+
OpenExp Architecture
+
An experience layer for AI agents. Not just memory — memory that learns which memories are useful.
+
+
+
+
+
+ ⊕
+
Zero-effort capture
+
Hooks observe every tool call automatically. No manual tagging, no save buttons. The agent just works — and everything important is recorded.
+
+
+ ↻
+
Self-improving retrieval
+
Q-learning ranks memories by actual usefulness. Memories that led to commits, PRs, closed deals get promoted. Noise gets demoted. Automatically.
+
+
+ ⌖
+
Context-aware learning
+
Different "Experiences" define what success looks like. Coding session rewards differ from sales. The system learns what works in each context.
+
+
+
+
+
+
Components
+
Each component is isolated with a single responsibility. They communicate through files and APIs — no tight coupling.
+
+
+
+
Event Sources — Claude Code Hooks
+
+
+
+
⚙
+
+
Observer
+
hooks/post-tool-use.sh
+
+
+
Records every Edit, Write, Bash action as a JSONL observation. Filters out read-only noise (Glob, Grep, Read).
+
Why: Raw signal capture. Without this, the system has nothing to learn from. Filtering prevents storage bloat.
+
+
+
+
▶
+
+
Session Start
+
hooks/session-start.sh
+
+
+
Searches Qdrant for top-10 relevant memories and injects them as context. Logs retrieval IDs for the reward loop.
+
Why: The agent starts every session informed by past experience. ID logging enables closed-loop reward.
+
Session End
+
hooks/session-end.sh
+
Generates the session summary, triggers ingest + reward asynchronously.
+
Why: Batch processing at session boundary. More efficient than per-action processing, ensures atomic ingest.
+
+
+
+
✉
+
+
Prompt Recall + Auto-Detect
+
hooks/user-prompt-recall.sh
+
+
+
Per-message context injection with experience auto-detection. Classifies prompt keywords (EN+UK) to switch between coding, sales, or dealflow. Searches with the correct experience so proven-useful memories rank higher.
+
Why: A memory about a successful proposal should rank higher when doing sales, not coding. Auto-detection means zero manual mode switching.
+
+
+
+
↓observations.jsonl↓retrieval IDs↓
+
+
+
Core Engine — Processing & Intelligence
+
+
+
+
⇅
+
+
Ingester
+
ingest/observation.py + session.py
+
+
+
Reads JSONL observations, embeds them with FastEmbed (BAAI/bge-small-en-v1.5, 384d), upserts vectors to Qdrant. Watermark-based idempotency prevents duplicates.
+
Why separate from hooks: Embedding is CPU-intensive. Running async at session-end keeps the agent responsive during work.
+
+
+
+
🔍
+
+
Hybrid Search
+
core/direct_search.py + hybrid_search.py
+
+
+
Combines vector similarity (Qdrant) with BM25 keyword scoring, recency decay, importance weights, memory status, and Q-value ranking.
+
Why hybrid: Pure vector search misses keyword matches. Pure BM25 misses semantics. The combination + Q-value is what makes retrieval improve over time.
+
+
+
+
★
+
+
Reward Engine
+
ingest/reward.py + outcome.py
+
+
+
Evaluates session productivity (commits, PRs, tests) and external outcomes (deal closed, payment received). Propagates reward to retrieved memories via Q-learning.
+
Why 4 reward paths: Session signals are fast but noisy. Business outcomes are slow but high-signal. Both needed for robust learning.
+
+
+
+
↓vectors + Q-updates↓
+
+
+
Storage — Persistent State
+
+
+
+
◆
+
+
Qdrant
+
localhost:6333 (Docker)
+
+
+
Vector database. Stores memory embeddings with metadata (type, importance, status, timestamps). Handles similarity search at scale.
+
Why Qdrant: Local-first (Docker), no API keys, no cloud dependency. Fast ANN search. Payload filtering for memory type/status.
+
+
+
+
Q
+
+
Q-Cache
+
data/q_cache.json + deltas/
+
+
+
JSON file storing Q-values per memory per experience. Three layers: action (50%), hypothesis (20%), fit (30%). File-locked for concurrent access.
+
Why separate from Qdrant: Q-values change every session. Updating Qdrant payloads on every reward would be expensive. JSON is fast read/write for the hot path.
+
+
+
+
📝
+
+
Observation Store
+
~/.openexp/observations/*.jsonl
+
+
+
Daily JSONL files with raw observations. Source of truth before ingest. Watermark tracks which observations have been processed.
+
Why JSONL files: Append-only writes are fast and crash-safe. No DB needed for sequential writes. Easy to debug, grep, replay.
+
+
+
+
↓search results + Q-values↓
+
+
+
Interface — How the Agent Accesses Memory
+
+
+
+
⚙
+
+
MCP Server
+
mcp_server.py (16 tools)
+
+
+
STDIO MCP server exposing 16 tools to Claude Code: search_memory, add_memory, reflect, explain_q, experience_insights, calibrate, log_prediction, resolve_outcomes, etc.
+
Why MCP: Standard protocol for Claude Code tool integration. Agent calls tools naturally in conversation. No special client needed.
+
+
+
+
>_
+
+
CLI
+
cli.py
+
+
+
Command-line interface for manual operations: search, ingest, stats, log-retrieval. Used by hooks (shell scripts call Python CLI) and for debugging.
+
Why CLI + MCP: Hooks run as shell scripts — they need CLI. Agent needs MCP. Same core, two interfaces.
+
+
+
+
+
+
+
+ Closed Loop: Retrieve → Use in session → Evaluate outcome → Reward retrieved memories → Better retrieval next time
+
+ The Q-value component is what makes OpenExp different from standard RAG. It's 30% of the final score — a memory with Q=0.9 (proven useful) scores 0.27 points higher than Q=0.0 (untested). This is enough to push a semantically weaker but historically useful memory above a closer but untested one.
+
Every architectural choice has a reason. Here's why OpenExp is built this way.
+
+
+
+
Q: Why local-first, not cloud?
+
Your code context, decisions, and work history are sensitive. OpenExp runs entirely on your machine: Qdrant in Docker, FastEmbed locally, no API calls. Your experience data never leaves your laptop.
+
+
+
Q: Why Q-learning instead of just vector search?
+
Vector similarity finds related memories. Q-learning finds useful ones. A memory about a library that led to 3 successful PRs should rank higher than a similar one that led nowhere. Q-values encode outcome history.
+
+
+
Q: Why separate Q-cache from Qdrant?
+
Q-values change every session (hot path). Qdrant payloads are expensive to update at scale. A JSON file with fcntl.flock gives fast, concurrent-safe reads/writes for the scoring formula.
+
+
+
Q: Why hooks, not an always-on daemon?
+
Claude Code hooks are event-driven — they fire only when needed. No background process consuming resources. Zero config: install hooks once, everything works automatically.
+
+
+
Q: Why 4 hooks instead of 1?
+
Observer captures during work. Session Start loads context before work. Prompt Recall adds per-message precision. Session End processes and learns. Each has a distinct timing requirement.
+
+
+
Q: Why "Experiences"?
+
A git commit is positive signal in coding, but irrelevant in sales outreach. Experiences let the same memory system work across different work contexts with context-appropriate reward functions.
+
+
+
Q: Why keyword detection, not LLM classification?
+
The hook runs on every user message. LLM call = 500ms+ latency + API cost. Keyword matching runs in <1ms, supports bilingual prompts (EN+UK), and requires zero API keys. Good enough for experience routing; LLM classification can be added for retrospective re-evaluation.
+
+
+
+
+
+
The Problem: More Context = Worse Performance
+
Research shows LLMs degrade with longer context — even with perfect retrieval.
+
+
+
+
"Lost in the Middle" (Stanford/Meta, 2023)
+
Accuracy drops from 75% to 55% when relevant info is in the middle of the context. U-shaped attention curve across GPT-4, Claude, LLaMA.
+
+
+
"Context Length Alone Hurts" (EMNLP 2025)
+
Even with perfect retrieval, performance degrades 13.9–85% from context length alone. The length itself is the problem.
+
+
+
NoLiMa (ICML 2025)
+
GPT-4o dropped from 99.3% to 69.7% at just 32K tokens. 11/12 models fell below 50% of baseline.
+
+
+
+
OpenExp = Hippocampus for AI
+
+ Instead of dumping all context into the prompt, OpenExp works like a hippocampus: record everything, but replay only what proved useful in similar situations. The Q-learning loop ensures that memories which led to successful outcomes (closed deals, merged PRs, passed tests) get replayed preferentially — while noise gets naturally demoted.
+
+
+
+
Encoding
+
Observer hook records every action
+
+
+
Consolidation
+
SessionEnd embeds & stores in Qdrant
+
+
+
Retrieval
+
Hybrid search with Q-value ranking
+
+
+
Reinforcement
+
Reward loop strengthens useful paths
+
+
+
+
+
+
+
Standard RAG vs OpenExp
+
+
+
+
Standard RAG Memory
+
+
Store everything, retrieve by similarity
+
Old irrelevant memory ranks same as yesterday's insight
+
No feedback loop — retrieval quality never improves
+
Manual curation needed to keep signal-to-noise ratio
+
Same retrieval logic regardless of work context
+
+
+
+
OpenExp
+
+
Store everything, retrieve by proven usefulness
+
Memories that led to results get promoted automatically
+
Closed-loop Q-learning improves retrieval every session
+
Noise gets demoted to Q < 0 — zero manual curation
+
Experience-specific reward functions per work context
+
+
+
+
+
+
+
+
+
+
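
The architecture page quotes two mixes: the Q-value layer split (action 50%, hypothesis 20%, fit 30%) and the Q share of the final retrieval score (30%, hence the 0.27 gap between Q=0.9 and Q=0.0). A worked sketch of that arithmetic; the non-Q hybrid weights below are placeholders, not the real DEFAULT_HYBRID_WEIGHTS values.

```python
def combined_q(q_action, q_hypothesis, q_fit):
    # Layer split stated on the page: 50% action, 20% hypothesis, 30% fit.
    return 0.5 * q_action + 0.2 * q_hypothesis + 0.3 * q_fit

def final_score(vector_sim, bm25, recency, importance, q_value,
                weights=(0.35, 0.15, 0.10, 0.10, 0.30)):  # placeholder mix; Q share = 30%
    signals = (vector_sim, bm25, recency, importance, q_value)
    return sum(w * s for w, s in zip(weights, signals))

# A proven memory (Q=0.9) outscores an otherwise identical untested one (Q=0.0)
# by 0.3 * 0.9 = 0.27 — enough to beat a semantically closer but untested hit.
gap = final_score(0.70, 0.5, 0.5, 0.5, 0.9) - final_score(0.70, 0.5, 0.5, 0.5, 0.0)
print(round(gap, 2))  # 0.27
```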
From 9c40a1614ab6aa9004f28456af9cb00e3d417636 Mon Sep 17 00:00:00 2001
From: John
Date: Mon, 6 Apr 2026 01:03:53 -0700
Subject: [PATCH 37/59] feat: extract decisions from session transcripts via
Opus 4.6 (#22)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Instead of recording actions ("Edited X.html"), the system now extracts
strategic decisions, insights, and commitments from conversation transcripts
using Opus 4.6 via claude -p (Max subscription, zero API cost).
- New module: openexp/ingest/extract_decisions.py
- read_transcript(): parses Claude Code JSONL, skips tool results/system noise
- extract_decisions(): calls claude -p --model opus for LLM extraction
- extract_and_store(): full pipeline → Qdrant with embeddings
- session-end.sh Phase 2c: runs extraction after ingest + reward
- Recursion guard: OPENEXP_EXTRACT_RUNNING=1 env var prevents hook loops
Co-authored-by: Ivan Pasichnyk
Co-authored-by: Claude Opus 4.6
---
openexp/hooks/session-end.sh | 50 +++++
openexp/ingest/extract_decisions.py | 313 ++++++++++++++++++++++++++++
2 files changed, 363 insertions(+)
create mode 100644 openexp/ingest/extract_decisions.py
diff --git a/openexp/hooks/session-end.sh b/openexp/hooks/session-end.sh
index 1771aa6..5c39385 100755
--- a/openexp/hooks/session-end.sh
+++ b/openexp/hooks/session-end.sh
@@ -9,6 +9,12 @@
# reward never gets computed, and Q-values stay at 0.5 forever.
set -uo pipefail
+# Guard: skip if running inside extraction subprocess (prevents recursion)
+if [ "${OPENEXP_EXTRACT_RUNNING:-}" = "1" ]; then
+ echo '{"hookSpecificOutput":{"hookEventName":"SessionEnd"}}'
+ exit 0
+fi
+
# Resolve paths relative to this script
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
OPENEXP_DIR="$(cd "$SCRIPT_DIR/../.." && pwd)"
@@ -227,6 +233,50 @@ print(f'Fallback reward={reward:.2f} applied to {updated} retrieved memories ({l
" >> "$INGEST_LOG" 2>&1
echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] SessionEnd: fallback reward finished" >> "$INGEST_LOG"
+ # Phase 2c: Decision extraction from transcript (Opus 4.6)
+ # This is the most valuable step — extracts DECISIONS, not actions.
+ TRANSCRIPT_DIR="$HOME/.claude/projects/-Users-ivanpasichnyk"
+ TRANSCRIPT_FILE=""
+ # Find transcript file for this session
+ for f in "$TRANSCRIPT_DIR"/*.jsonl; do
+ [ -f "$f" ] || continue
+ if grep -q "\"sessionId\":\"$SESSION_ID\"" "$f" 2>/dev/null; then
+ TRANSCRIPT_FILE="$f"
+ break
+ fi
+ done
+ # Also try partial match
+ if [ -z "$TRANSCRIPT_FILE" ]; then
+ for f in "$TRANSCRIPT_DIR"/*.jsonl; do
+ [ -f "$f" ] || continue
+ if grep -q "$SESSION_SHORT" "$f" 2>/dev/null; then
+ TRANSCRIPT_FILE="$f"
+ break
+ fi
+ done
+ fi
+
+ if [ -n "$TRANSCRIPT_FILE" ]; then
+ echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] SessionEnd: extracting decisions from $TRANSCRIPT_FILE" >> "$INGEST_LOG"
+ "$PYTHON" -c "
+import sys, json, logging
+sys.path.insert(0, '.')
+logging.basicConfig(level=logging.INFO)
+from pathlib import Path
+from openexp.ingest.extract_decisions import extract_and_store
+
+result = extract_and_store(
+ transcript_path=Path('$TRANSCRIPT_FILE'),
+ session_id='$SESSION_ID',
+ experience='$EXPERIENCE',
+)
+print(json.dumps(result, default=str))
+" >> "$INGEST_LOG" 2>&1
+ echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] SessionEnd: decision extraction finished" >> "$INGEST_LOG"
+ else
+ echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] SessionEnd: no transcript found for session $SESSION_SHORT" >> "$INGEST_LOG"
+ fi
+
# Cleanup session experience file
"$PYTHON" -c "
import sys
diff --git a/openexp/ingest/extract_decisions.py b/openexp/ingest/extract_decisions.py
new file mode 100644
index 0000000..58f608f
--- /dev/null
+++ b/openexp/ingest/extract_decisions.py
@@ -0,0 +1,313 @@
+"""Extract decisions from Claude Code conversation transcripts.
+
+Instead of recording "Edited X.html" (action), extracts:
+- What was the choice point?
+- What alternatives existed?
+- Why was this path chosen?
+- What was learned?
+
+Uses claude -p (Max subscription, Opus 4.6) — extraction quality IS the product.
+"""
+import json
+import logging
+import os
+import subprocess
+from pathlib import Path
+from typing import Dict, List, Optional
+
+logger = logging.getLogger(__name__)
+
+# Configurable via env vars
+# Opus 4.6 — quality of extraction determines quality of the entire memory system.
+# This is not a place to save money. This is the annotation layer.
+EXTRACT_MODEL = os.getenv("OPENEXP_EXTRACT_MODEL", "claude-opus-4-6")
+EXTRACT_MAX_TOKENS = int(os.getenv("OPENEXP_EXTRACT_MAX_TOKENS", "2048"))
+# Max chars of transcript to send to LLM (cost control)
+EXTRACT_CONTEXT_LIMIT = int(os.getenv("OPENEXP_EXTRACT_CONTEXT_LIMIT", "30000"))
+
+EXTRACTION_PROMPT = """\
+You are analyzing a work session between Ivan (entrepreneur, AI/data labeling business) and his AI assistant.
+
+Your job: extract DECISIONS and STRATEGIC INSIGHTS — not actions.
+
+## What to extract
+
+1. **DECISIONS** — moments where a choice was made.
+ - What was the choice point?
+ - What was chosen and why?
+ - What was the alternative?
+
+2. **INSIGHTS** — things learned about clients, markets, patterns.
+ - What was the insight?
+ - Why does it matter for future work?
+
+3. **COMMITMENTS** — promises or agreements made.
+ - Who committed to what, by when?
+
+## What NOT to extract
+- File edits, tool calls, code changes (already captured separately)
+- Calendar scheduling, meeting logistics
+- Greetings, acknowledgments, filler
+- Technical implementation details (code structure, config changes)
+
+## Output format
+Return a JSON array. Each item:
+```json
+{
+ "type": "decision" | "insight" | "commitment",
+ "content": "One clear sentence describing what happened and WHY",
+ "importance": 0.0-1.0,
+ "tags": ["client-name", "domain"],
+ "client_id": "comp-xxx or null"
+}
+```
+
+Be selective. 3-8 items per session is ideal. Only extract what would be valuable
+to recall in a FUTURE conversation — the kind of context that changes how you
+approach the next similar situation.
+
+Think strategically: helicopter view + details. Not "sent email" but "chose to
+lead with social proof because enterprise clients trust references".
+"""
+
+
+def read_transcript(transcript_path: Path, session_id: Optional[str] = None) -> str:
+ """Read and format a Claude Code transcript for LLM extraction.
+
+ Returns a condensed text of user<>assistant exchanges,
+ skipping tool results, system messages, and other noise.
+ """
+ if not transcript_path.exists():
+ return ""
+
+ messages = []
+ for line in transcript_path.read_text(encoding="utf-8").splitlines():
+ if not line.strip():
+ continue
+ try:
+ entry = json.loads(line)
+ except json.JSONDecodeError:
+ continue
+
+ msg_type = entry.get("type")
+ if msg_type not in ("user", "assistant"):
+ continue
+
+ # Skip tool results (user messages that are just tool output)
+ if msg_type == "user":
+ content = entry.get("message", {}).get("content", [])
+ texts = []
+ for block in content:
+ if isinstance(block, dict) and block.get("type") == "text":
+ text = block.get("text", "").strip()
+ # Skip hook injections and system reminders
+ if text and not text.startswith("<system-reminder>"):
+ texts.append(text)
+ if not texts:
+ continue
+ messages.append(("user", "\n".join(texts)))
+
+ elif msg_type == "assistant":
+ content = entry.get("message", {}).get("content", [])
+ texts = []
+ for block in content:
+ if isinstance(block, dict) and block.get("type") == "text":
+ text = block.get("text", "").strip()
+ if text:
+ texts.append(text)
+ if not texts:
+ continue
+ messages.append(("assistant", "\n".join(texts)))
+
+ if not messages:
+ return ""
+
+ # Build condensed transcript, respecting context limit
+ # Prioritize recent messages (most likely to contain decisions)
+ formatted = []
+ total_chars = 0
+ for role, text in reversed(messages):
+ entry_text = f"{'IVAN' if role == 'user' else 'ASSISTANT'}: {text}\n"
+ if total_chars + len(entry_text) > EXTRACT_CONTEXT_LIMIT:
+ break
+ formatted.append(entry_text)
+ total_chars += len(entry_text)
+
+ formatted.reverse()
+ return "\n".join(formatted)
+
+
+def extract_decisions(
+ transcript_text: str,
+ session_id: str = "",
+ experience: str = "default",
+) -> List[Dict]:
+ """Extract decisions from a transcript using claude -p (Max subscription).
+
+ Uses Claude Code CLI in pipe mode to leverage the user's Max subscription
+ instead of requiring API credits. Setting OPENEXP_EXTRACT_RUNNING=1 in the
+ subprocess env makes the SessionEnd hook exit early, preventing recursion
+ (this extraction itself runs inside that hook).
+
+ Returns list of extracted items (decisions, insights, commitments).
+ """
+ if not transcript_text or len(transcript_text) < 100:
+ logger.info("Transcript too short for extraction (%d chars)", len(transcript_text))
+ return []
+
+ # Build the full prompt: system instructions + transcript
+ full_prompt = (
+ f"{EXTRACTION_PROMPT}\n\n"
+ f"---\n\n"
+ f"Extract decisions and insights from this work session:\n\n"
+ f"{transcript_text}"
+ )
+
+ response_text = ""
+ try:
+ # Use claude -p (pipe mode) with Max subscription
+ # --model opus: use Opus 4.6 for highest extraction quality
+ # OPENEXP_EXTRACT_RUNNING=1 prevents hook recursion (session-end checks this)
+ env = {**os.environ, "OPENEXP_EXTRACT_RUNNING": "1"}
+ result = subprocess.run(
+ ["claude", "-p", "--model", "opus"],
+ input=full_prompt,
+ capture_output=True,
+ text=True,
+ timeout=120, # 2 min timeout for Opus
+ env=env,
+ )
+
+ if result.returncode != 0:
+ logger.error(
+ "claude -p failed (exit=%d): %s",
+ result.returncode, result.stderr[:500],
+ )
+ return []
+
+ response_text = result.stdout.strip()
+ if not response_text:
+ logger.error("claude -p returned empty response")
+ return []
+
+ # Extract JSON from response (may be wrapped in markdown code block)
+ json_text = response_text
+ if "```json" in json_text:
+ json_text = json_text.split("```json")[1].split("```")[0]
+ elif "```" in json_text:
+ json_text = json_text.split("```")[1].split("```")[0]
+
+ items = json.loads(json_text.strip())
+ if not isinstance(items, list):
+ items = [items]
+
+ logger.info(
+ "Extracted %d items from transcript (%d chars, model=%s, via claude -p)",
+ len(items), len(transcript_text), EXTRACT_MODEL,
+ )
+ return items
+
+ except subprocess.TimeoutExpired:
+ logger.error("claude -p timed out after 120s")
+ return []
+ except json.JSONDecodeError as e:
+ logger.error("Failed to parse extraction response: %s", e)
+ logger.debug("Response was: %s", response_text[:500] if response_text else "empty")
+ return []
+ except FileNotFoundError:
+ logger.error("claude CLI not found in PATH — is Claude Code installed?")
+ return []
+ except Exception as e:
+ logger.error("Decision extraction failed: %s", e)
+ return []
+
+
+def extract_and_store(
+ transcript_path: Path,
+ session_id: str,
+ experience: str = "default",
+ dry_run: bool = False,
+) -> Dict:
+ """Full pipeline: read transcript → extract → store as memories.
+
+ Returns summary of what was extracted and stored.
+ """
+ transcript_text = read_transcript(transcript_path, session_id)
+ if not transcript_text:
+ return {"extracted": 0, "reason": "empty_transcript"}
+
+ items = extract_decisions(transcript_text, session_id, experience)
+ if not items:
+ return {"extracted": 0, "reason": "no_decisions_found"}
+
+ if dry_run:
+ return {"extracted": len(items), "items": items, "dry_run": True}
+
+ # Store each item as a memory via the openexp API
+ stored = 0
+ from ..core.config import COLLECTION_NAME, QDRANT_HOST, QDRANT_PORT
+ from ..core.direct_search import _embed
+ from qdrant_client import QdrantClient
+ from qdrant_client.models import PointStruct
+ import uuid
+ from datetime import datetime, timezone
+
+ client = QdrantClient(host=QDRANT_HOST, port=QDRANT_PORT)
+
+ for item in items:
+ content = item.get("content", "")
+ if not content:
+ continue
+
+ item_type = item.get("type", "decision")
+ importance = item.get("importance", 0.5)
+ tags = item.get("tags", [])
+ client_id = item.get("client_id")
+
+ memory_type = {
+ "decision": "decision",
+ "insight": "insight",
+ "commitment": "action",
+ }.get(item_type, "decision")
+
+ try:
+ vector = _embed(content)
+ point_id = str(uuid.uuid4())
+ now = datetime.now(timezone.utc).isoformat()
+
+ payload = {
+ "memory": content,
+ "type": memory_type,
+ "agent": "session",
+ "source": "decision_extraction",
+ "importance": importance,
+ "tags": tags,
+ "session_id": session_id,
+ "experience": experience,
+ "created_at": now,
+ "status": "active",
+ }
+ if client_id:
+ payload["client_id"] = client_id
+
+ client.upsert(
+ collection_name=COLLECTION_NAME,
+ points=[
+ PointStruct(
+ id=point_id,
+ vector=vector,
+ payload=payload,
+ )
+ ],
+ )
+ stored += 1
+ logger.info("Stored decision: %s (type=%s, importance=%.1f)", content[:80], memory_type, importance)
+
+ except Exception as e:
+ logger.error("Failed to store decision '%s': %s", content[:50], e)
+
+ return {
+ "extracted": len(items),
+ "stored": stored,
+ "experience": experience,
+ "model": EXTRACT_MODEL,
+ }
From d3cae18474410beea4e5b76962e2d468356372ff Mon Sep 17 00:00:00 2001
From: John
Date: Mon, 6 Apr 2026 01:33:57 -0700
Subject: [PATCH 38/59] docs: comprehensive update — decision extraction, 4-phase learning cycle (#24)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
- New: docs/decision-extraction.md — full reference for Opus 4.6 extraction
- Updated: how-it-works.md — 4-phase learning cycle (store → auto-reward → extraction → calibration)
- Updated: architecture.md — extract_decisions.py in ingest pipeline
- Updated: storage-system.md — Phase 2c, new env vars, pipeline flow
- Updated: configuration.md — extraction env vars
- Updated: README.md — decision extraction in hooks, architecture tree, docs list
Co-authored-by: Ivan Pasichnyk
Co-authored-by: Claude Opus 4.6
---
README.md | 12 ++-
docs/architecture.md | 3 +-
docs/configuration.md | 9 ++
docs/decision-extraction.md | 169 ++++++++++++++++++++++++++++++++++++
docs/how-it-works.md | 34 +++++++-
docs/storage-system.md | 14 ++-
6 files changed, 234 insertions(+), 7 deletions(-)
create mode 100644 docs/decision-extraction.md
diff --git a/README.md b/README.md
index 40c399e..2b0f91b 100644
--- a/README.md
+++ b/README.md
@@ -145,7 +145,9 @@ Three hooks integrate with Claude Code automatically:
| **SessionStart** | Session opens | Searches Qdrant for relevant memories, injects top results as context |
| **UserPromptSubmit** | Every message | Lightweight recall — adds relevant memories to each prompt |
| **PostToolUse** | After Write/Edit/Bash | Captures what Claude does as observations (JSONL) |
-| **SessionEnd** | Session closes | Generates summary, triggers ingest + reward (async) |
+| **SessionEnd** | Session closes | Summary → ingest → reward → decision extraction (async) |
+
+After each session, Opus 4.6 reads the conversation transcript and extracts **decisions** (not actions) — strategic choices, insights, and commitments that have value for future similar situations. See [Decision Extraction](docs/decision-extraction.md).
The MCP server provides 16 tools for memory operations, introspection, and calibration.
@@ -310,7 +312,8 @@ openexp/
│ ├── reward.py # Session productivity → reward signal
│ ├── retrieval_log.py # Closed-loop: which memories were recalled
│ ├── watermark.py # Idempotent ingestion tracking
-│ └── filters.py # Filter trivial observations
+│ ├── filters.py # Filter trivial observations
+│ └── extract_decisions.py # Opus 4.6 decision extraction from transcripts
│
├── resolvers/ # Outcome resolvers (pluggable)
│ └── crm_csv.py # CRM CSV stage transition → reward events
@@ -435,8 +438,9 @@ See the [Experiences Guide](docs/experiences.md) for full details.
Detailed docs are available in the [`docs/`](docs/) directory:
-- [How It Works](docs/how-it-works.md) — full explanation of the learning loop
-- [Storage System](docs/storage-system.md) — 5-level pyramid (L0–L4), all 4 reward paths
+- [How It Works](docs/how-it-works.md) — the 4-phase learning cycle
+- [Decision Extraction](docs/decision-extraction.md) — Opus 4.6 extracts decisions, not actions
+- [Storage System](docs/storage-system.md) — 5-level pyramid (L0-L4), all 4 reward paths
- [Experiences](docs/experiences.md) — domain-specific reward profiles (create your own)
- [Architecture](docs/architecture.md) — system design and data flow
- [Configuration](docs/configuration.md) — all environment variables and options
diff --git a/docs/architecture.md b/docs/architecture.md
index 4806f94..de357f1 100644
--- a/docs/architecture.md
+++ b/docs/architecture.md
@@ -79,6 +79,7 @@ Converts raw observations (JSONL) into embedded vectors in Qdrant:
4. **reward.py** — Computes session productivity score, applies Q-value updates (all 3 layers)
5. **retrieval_log.py** — Tracks which memories were recalled (for closed-loop reward)
6. **watermark.py** — Idempotency: prevents duplicate ingestion
+7. **extract_decisions.py** — Opus 4.6 extracts strategic decisions/insights from transcripts (Phase 2c)
### Outcome Resolution (`openexp/outcome.py` + `openexp/resolvers/`)
@@ -99,7 +100,7 @@ Shell scripts registered with Claude Code:
- **session-start.sh** — Builds contextual query, searches Qdrant, formats results, logs retrieval
- **user-prompt-recall.sh** — Per-message recall (skips trivial inputs), logs retrieval
- **post-tool-use.sh** — Captures Write/Edit/Bash observations, skips Read/Glob/Grep
-- **session-end.sh** — Generates session summary, triggers async ingest + reward computation
+- **session-end.sh** — Generates session summary, triggers async ingest + reward + decision extraction
## Data Persistence
diff --git a/docs/configuration.md b/docs/configuration.md
index 40e7115..24a5cf9 100644
--- a/docs/configuration.md
+++ b/docs/configuration.md
@@ -45,6 +45,15 @@ Without `ANTHROPIC_API_KEY`, memories are stored with basic metadata. The system
See [Experiences Guide](experiences.md) for details on creating custom experiences.
+### Decision Extraction
+| Variable | Default | Description |
+|----------|---------|-------------|
+| `OPENEXP_EXTRACT_MODEL` | `claude-opus-4-6` | LLM model for extraction (do not downgrade) |
+| `OPENEXP_EXTRACT_MAX_TOKENS` | `2048` | Max response tokens |
+| `OPENEXP_EXTRACT_CONTEXT_LIMIT` | `30000` | Max chars of transcript sent to LLM |
+
+Decision extraction uses `claude -p` (Claude Code pipe mode) to leverage your Max subscription. No API key needed.
+
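+For example, to widen the transcript window and allow a longer response (values here are illustrative, not recommendations):
+
+```bash
+export OPENEXP_EXTRACT_CONTEXT_LIMIT=60000  # send up to 60K chars of transcript
+export OPENEXP_EXTRACT_MAX_TOKENS=4096      # allow a longer JSON response
+```
+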
### Ingest Pipeline
| Variable | Default | Description |
|----------|---------|-------------|
diff --git a/docs/decision-extraction.md b/docs/decision-extraction.md
new file mode 100644
index 0000000..7f80b95
--- /dev/null
+++ b/docs/decision-extraction.md
@@ -0,0 +1,169 @@
+# Decision Extraction
+
+> Extract strategic decisions, insights, and commitments from session transcripts.
+> The system records "chose to lead with social proof because enterprise clients trust references" — not "edited proposal.html".
+
+## Why This Matters
+
+Without decision extraction, OpenExp records **actions** (tool calls, file edits, commands). Actions are useful for reward computation but have low strategic value — "Edited file.html" tells you nothing about **why** that edit was made or **what alternative was considered**.
+
+Decision extraction uses Opus 4.6 to read the full conversation transcript and extract:
+
+1. **Decisions** — choice points with reasoning. What was chosen, why, and what was the alternative?
+2. **Insights** — things learned about clients, markets, patterns. Why does it matter for future work?
+3. **Commitments** — promises or agreements. Who committed to what, by when?
+
+These extracted items become first-class memories in Qdrant, searchable and Q-value-ranked like any other memory.
+
+## How It Works
+
+Decision extraction runs automatically as **Phase 2c** of the SessionEnd hook (async, after ingest + reward):
+
+```
+Session ends
+ ↓
+Phase 2a: Ingest observations + session reward
+Phase 2b: Fallback reward for pre-ingested obs
+Phase 2c: Decision extraction from transcript (NEW)
+ ↓
+Find transcript JSONL for this session
+ ↓
+Read and condense transcript (skip tool results, system noise)
+ ↓
+Send to Opus 4.6 via claude -p (Max subscription)
+ ↓
+Parse JSON response → store each item in Qdrant with embedding
+```
+
+### Transcript Processing
+
+The transcript reader (`read_transcript()`) processes Claude Code JSONL transcripts:
+
+- Reads only `user` and `assistant` message types
+- Extracts text blocks, skips `tool_result` and `system-reminder` content
+- Prioritizes recent messages (builds from end, respects context limit)
+- Default context limit: 30,000 chars (configurable via `OPENEXP_EXTRACT_CONTEXT_LIMIT`)
+
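+A simplified sketch of the condensing logic (illustrative only — the helper name and the JSONL field layout are assumptions, not the module's exact code):
+
+```python
+import json
+
+def condense_transcript(path, limit=30_000):
+    """Keep user/assistant text only, newest messages first, under a char budget."""
+    turns = []
+    with open(path) as f:
+        for line in f:
+            try:
+                entry = json.loads(line)
+            except json.JSONDecodeError:
+                continue  # skip malformed lines
+            if entry.get("type") not in ("user", "assistant"):
+                continue  # drop tool results, system noise, etc.
+            content = entry.get("message", {}).get("content", "")
+            if isinstance(content, list):  # content blocks: keep text, skip tool_result
+                content = " ".join(
+                    b.get("text", "") for b in content
+                    if isinstance(b, dict) and b.get("type") == "text"
+                )
+            if content.strip():
+                prefix = "IVAN:" if entry["type"] == "user" else "ASSISTANT:"
+                turns.append(f"{prefix} {content.strip()}")
+    kept, used = [], 0
+    for turn in reversed(turns):  # build from the end so recent messages survive
+        if used + len(turn) > limit:
+            break
+        kept.append(turn)
+        used += len(turn)
+    return "\n\n".join(reversed(kept))
+```
+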
+### LLM Extraction
+
+Uses `claude -p --model opus` (pipe mode) to leverage Claude Max subscription — zero API cost.
+
+The extraction prompt instructs Opus 4.6 to:
+- Think strategically: "helicopter view + details"
+- Be selective: 3-8 items per session
+- Focus on what would be valuable in a FUTURE conversation
+- Skip file edits, tool calls, code changes (already captured as observations)
+
+### Storage
+
+Each extracted item is stored in Qdrant with:
+
+```json
+{
+ "memory": "Chose to remove advertising from scope because we're not a marketing agency — client needs automation, not ads",
+ "type": "decision",
+ "source": "decision_extraction",
+ "importance": 0.8,
+ "tags": ["client-name", "scoping"],
+ "session_id": "abc-123",
+ "experience": "sales",
+ "status": "active"
+}
+```
+
+Memory types are mapped: `decision` → `decision`, `insight` → `insight`, `commitment` → `action`.
+
+## Configuration
+
+| Variable | Default | Description |
+|----------|---------|-------------|
+| `OPENEXP_EXTRACT_MODEL` | `claude-opus-4-6` | LLM model for extraction (do not downgrade) |
+| `OPENEXP_EXTRACT_MAX_TOKENS` | `2048` | Max response tokens |
+| `OPENEXP_EXTRACT_CONTEXT_LIMIT` | `30000` | Max chars of transcript sent to LLM |
+
+### Model Quality
+
+Opus 4.6 is mandatory for extraction. The quality of extracted decisions determines the quality of the entire memory system. This is the annotation layer — not a place to save money.
+
+### Recursion Guard
+
+Decision extraction runs inside the SessionEnd hook and spawns `claude -p` as a subprocess. To prevent the subprocess from triggering its own SessionEnd → extraction → subprocess loop:
+
+1. The `extract_decisions()` function sets `OPENEXP_EXTRACT_RUNNING=1` in the subprocess environment
+2. `session-end.sh` checks this variable at startup and exits immediately if set
+
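+A minimal sketch of the check (the real `session-end.sh` may structure it differently):
+
+```bash
+# Recursion guard: if we are the claude -p subprocess spawned by extraction,
+# exit before doing any work — otherwise SessionEnd would re-trigger extraction.
+if [ "${OPENEXP_EXTRACT_RUNNING:-0}" = "1" ]; then
+    exit 0
+fi
+```
+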
+## API
+
+### `read_transcript(transcript_path, session_id=None) -> str`
+
+Read and condense a Claude Code JSONL transcript. Returns formatted text with `IVAN:` and `ASSISTANT:` prefixes.
+
+### `extract_decisions(transcript_text, session_id="", experience="default") -> List[Dict]`
+
+Extract decisions from transcript text using Opus 4.6. Returns list of items:
+
+```python
+[
+ {
+ "type": "decision",
+ "content": "One clear sentence describing what happened and WHY",
+ "importance": 0.8,
+ "tags": ["domain", "client"],
+ "client_id": "comp-xxx" # or null
+ }
+]
+```
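+
+A typical call, using transcript text from `read_transcript()`:
+
+```python
+items = extract_decisions(transcript_text, session_id="abc-123", experience="sales")
+```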
+
+### `extract_and_store(transcript_path, session_id, experience="default", dry_run=False) -> Dict`
+
+Full pipeline: read transcript → extract → store in Qdrant.
+
+```python
+# Dry run (extract without storing)
+result = extract_and_store(path, session_id, dry_run=True)
+# {"extracted": 6, "items": [...], "dry_run": True}
+
+# Real run
+result = extract_and_store(path, session_id, experience="sales")
+# {"extracted": 6, "stored": 6, "experience": "sales", "model": "claude-opus-4-6"}
+```
+
+## Example Output
+
+From a real session about a client proposal:
+
+```json
+[
+ {
+ "type": "decision",
+ "content": "Removed advertising from Modecks scope because we're not a marketing agency — client needs CRM+email+follow-up automation, not Google Ads management",
+ "importance": 0.9,
+ "tags": ["modecks", "scoping", "pricing"]
+ },
+ {
+ "type": "insight",
+ "content": "For small contractors (decks/fencing), semi-automatic approach (Claude Code + one click) is more valuable than full automation: follow-up semi-auto = 2-3 hrs vs full auto = 8-12 hrs. Client needs control, not full autonomy.",
+ "importance": 0.8,
+ "tags": ["product-strategy", "semi-auto-vs-auto"]
+ },
+ {
+ "type": "insight",
+ "content": "All won clients came through network/referrals — zero presence on freelance platforms despite strong fit. Untapped channel.",
+ "importance": 0.8,
+ "tags": ["sales-channel", "growth"]
+ },
+ {
+ "type": "commitment",
+ "content": "TODO: finalize scope, update price in HTML proposal, send to client by tomorrow",
+ "importance": 0.6,
+ "tags": ["follow-up"]
+ }
+]
+```
+
+## Files
+
+| File | Purpose |
+|------|---------|
+| `openexp/ingest/extract_decisions.py` | Core module: read, extract, store |
+| `openexp/hooks/session-end.sh` | Phase 2c integration (lines 235-272) |
diff --git a/docs/how-it-works.md b/docs/how-it-works.md
index 1074913..c44ef7b 100644
--- a/docs/how-it-works.md
+++ b/docs/how-it-works.md
@@ -50,7 +50,19 @@ When the session ends, the SessionEnd hook:
2. Saves it to `~/.openexp/sessions/`
3. Triggers async ingest + reward computation (runs in background so it doesn't block exit)
-### 4. Q-Learning Reward Loop
+### 4. Decision Extraction (SessionEnd Phase 2c)
+
+After ingest and reward, Opus 4.6 reads the full conversation transcript and extracts:
+
+- **Decisions** — "Chose to remove advertising from scope because we're not a marketing agency"
+- **Insights** — "All won clients came through referrals — zero presence on freelance platforms"
+- **Commitments** — "Finalize proposal and send by tomorrow"
+
+This is the critical difference between recording "Edited proposal.html" (action) and recording "Chose to lead with social proof because enterprise clients trust references" (decision with reasoning). Decisions have strategic value; actions don't.
+
+See [Decision Extraction](decision-extraction.md) for full details.
+
+### 5. Q-Learning Reward Loop
This is the core innovation. After each session:
@@ -65,6 +77,26 @@ Q_new = (1 - 0.25) × Q_old + 0.25 × reward
Over time, this creates a natural ranking where useful memories (project conventions, working solutions, important decisions) rise to the top, while noise (trivial commands, one-off fixes) sinks.
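+
+For example, with α = 0.25 and Q starting at 0, three productive sessions with reward 0.6 move Q through 0.15 → 0.26 → 0.35, while a memory recalled only in empty sessions (reward -0.1) drifts down through -0.025 → -0.044 → -0.058.
+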
+## The 4-Phase Learning Cycle
+
+OpenExp learns in four phases, each building on the previous:
+
+**Phase 1 — Store.** Agent works, system writes every action, decision, and context to the vector database. Hooks handle this automatically. Retrieval at this stage = basic vector search.
+
+**Phase 2 — Auto-reward.** After each session, the system evaluates productivity (commits, PRs, deploys, emails sent). Memories from productive sessions get higher Q-values. Noise starts sinking.
+
+**Phase 3 — Decision extraction.** Opus 4.6 reads the conversation transcript and extracts strategic decisions, insights, and commitments. These become first-class memories — the kind of context that changes how you approach the next similar situation.
+
+**Phase 4 — Human calibration.** After a significant outcome (deal closed, project shipped), the user reviews related memories and calibrates Q-values. "This memory directly contributed to closing the deal" → Q goes up. "This was irrelevant noise" → Q goes down.
+
+### What you see over time
+
+| Time | What happens |
+|------|-------------|
+| **Week 1** | System stores everything. Retrieval = vector search. |
+| **Month 1** | Auto-rewards separate productive from empty sessions. Decision extraction adds strategic memories. |
+| **Month 3** | Retrieval is fundamentally different from plain search. Proven decisions surface first. Noise is gone. |
+
## Reward Signals
Reward weights are defined by the active **Experience**. The `default` experience rewards coding; `sales` rewards emails and follow-ups; `dealflow` rewards proposals, invoices, and payments. See [Experiences](experiences.md) for full details and how to create your own.
diff --git a/docs/storage-system.md b/docs/storage-system.md
index 4bcb3fb..0c7c152 100644
--- a/docs/storage-system.md
+++ b/docs/storage-system.md
@@ -323,8 +323,16 @@ Same memory can have different Q-values per experience (e.g., "default", "sales"
reward.py (compute session reward → update Q-values)
↓
watermark.py (mark processed obs IDs for idempotency)
+ ↓
+~/.claude/projects/*/*.jsonl (Claude Code transcripts)
+ ↓
+ extract_decisions.py (Opus 4.6 via claude -p → decisions/insights → Qdrant)
```
+### Decision Extraction (`ingest/extract_decisions.py`)
+
+Runs as Phase 2c of SessionEnd (after ingest + reward). Uses Opus 4.6 to extract strategic decisions, insights, and commitments from the conversation transcript. See [Decision Extraction](decision-extraction.md) for details.
+
### Filters (`ingest/filters.py`)
Drops: read-only commands (cat, grep, ls), short summaries (<15 chars), Read/Glob/Grep tool calls.
@@ -339,7 +347,7 @@ Keeps: Write, Edit, Bash with side effects, decisions, valuable tags.
| **SessionStart** | `session-start.sh` | Session begins | Search Qdrant → inject top-5 memories → log retrieval IDs |
| **UserPromptSubmit** | `user-prompt-recall.sh` | Each message | Context recall (skip trivial) → inject |
| **PostToolUse** | `post-tool-use.sh` | After Write/Edit/Bash | Write observation to JSONL (skip reads) |
-| **SessionEnd** | `session-end.sh` | Session ends | Generate summary → async ingest → compute reward |
+| **SessionEnd** | `session-end.sh` | Session ends | Generate summary → async ingest → reward → decision extraction |
---
@@ -377,6 +385,7 @@ Keeps: Write, Edit, Bash with side effects, decisions, valuable tags.
| `ingest/reward.py` | Session reward computation + Q-update + L3/L4 |
| `ingest/retrieval_log.py` | Track recalled memory IDs |
| `ingest/watermark.py` | Idempotent ingestion tracking |
+| `ingest/extract_decisions.py` | Opus 4.6 decision extraction from transcripts |
### Reward Paths
@@ -437,6 +446,9 @@ Keeps: Write, Edit, Bash with side effects, decisions, valuable tags.
| `QDRANT_PORT` | `6333` | Qdrant port |
| `QDRANT_API_KEY` | `""` | Qdrant auth (optional) |
| `ANTHROPIC_API_KEY` | `""` | For enrichment + explanations |
+| `OPENEXP_EXTRACT_MODEL` | `claude-opus-4-6` | Decision extraction model |
+| `OPENEXP_EXTRACT_MAX_TOKENS` | `2048` | Max tokens for extraction |
+| `OPENEXP_EXTRACT_CONTEXT_LIMIT` | `30000` | Max transcript chars sent to LLM |
---
From 5b64e838f230685c08c8cf57fdb979a8e67682fc Mon Sep 17 00:00:00 2001
From: John
Date: Mon, 6 Apr 2026 01:37:24 -0700
Subject: [PATCH 39/59] docs: add honest outreach pitch with 4-phase learning
cycle (#23)
Pitch for content creators/AI communities that accurately describes
what OpenExp does today vs what it becomes over time. Uses real
calibration data (46 memories, Q range -0.3 to 0.9) instead of
hypothetical scenarios.
Co-authored-by: Ivan Pasichnyk
Co-authored-by: Claude Opus 4.6
---
docs/outreach-pitch.md | 84 ++++++++++++++++++++++++++++++++++++++++++
1 file changed, 84 insertions(+)
create mode 100644 docs/outreach-pitch.md
diff --git a/docs/outreach-pitch.md b/docs/outreach-pitch.md
new file mode 100644
index 0000000..b52caa1
--- /dev/null
+++ b/docs/outreach-pitch.md
@@ -0,0 +1,84 @@
+# OpenExp — Outreach Pitch
+
+> Template for outreach to content creators, bloggers, and AI communities.
+> Originally in Ukrainian; adapt to the specific audience.
+> Last updated: 2026-04-06
+
+---
+
+Hi!
+
+I'm Ivan. I built an open-source memory system for AI agents — OpenExp (github.com/anthroos/openexp). I think it would be a great fit for your audience.
+
+## The Problem
+
+Data keeps growing and context degrades — and that's not theory, it's numbers:
+- GPT-4o drops from 99.3% to 69.7% accuracy at 32K tokens
+- Opus 4.6 scores 78.3% on MRCR v2 at 1M tokens — that is, 1 in 5 facts gets lost
+- Real-world tests: degradation is noticeable from 400K already; past 600K, retrieval is unreliable
+- Du et al., 2025: 13.9–85% degradation even at 100% retrieval accuracy — long context by itself kills reasoning
+
+Everyone is trying to cram more into the prompt. I propose the opposite.
+
+## The Solution — OpenExp
+
+The principle is simple: **Store everything. Retrieve what worked.**
+
+Existing memory systems (Mem0, Zep, LangMem) store and search. But to them, every memory is equally important — a critical architecture decision and a random grep command carry the same weight.
+
+OpenExp adds a layer nobody else has: **memory that learns from outcomes.**
+
+### How it works — the 4 learning phases
+
+**Phase 1 — Store.** The agent works; the system automatically writes every action, decision, and piece of context to a vector database. Claude Code hooks do this with zero effort.
+
+**Phase 2 — Automatic rewards.** After each session the system checks: were there commits? PRs? A deploy? Did tests pass? Memories used in productive sessions get a higher Q-value; memories from empty sessions get a lower one.
+
+**Phase 3 — Decision extraction.** Instead of "Edited X.html" (an action), Opus 4.6 extracts from the conversation transcript: "Removed advertising from scope because we're not an agency — the client needs automation, not marketing" (a decision with its reasoning). That's what has value for future situations.
+
+**Phase 4 — Human calibration.** Deal closed? Project failed? The human tells the system "this memory helped" or "this was useless", and Q-values update with precision.
+
+### What happens over time
+
+Week one — the system stores everything. Retrieval = plain vector search.
+
+Month one — automatic rewards start separating the useful from the noise. Memories from productive sessions rise.
+
+After 3 months — retrieval is fundamentally different from plain search. Proven decisions surface first. Noise sinks.
+
+### Example from real usage
+
+My database holds 46 memories calibrated for the "sales" experience:
+- **Q = 0.9**: "Never name clients in proposals — NDA risk" + "T-Mobile testimonial via Cyril Bialo = strongest social proof"
+- **Q = 0.8**: Involving the decision-maker (not just the technical contact); discovery call with all stakeholders at once
+- **Q = -0.3**: The FD Group proposal — wrong approach, wasted time
+
+The same memory can carry a different Q-value in different contexts. The NDA rule has q=0.9 in sales but q=0.0 in coding — it's irrelevant there.
+
+## Technical Details
+
+- **Hybrid retrieval**: 5 signals — vector similarity (30%), Q-value (30%), BM25 keywords (10%), recency (15%), importance (15%)
+- **Q-learning** — the same algorithm that trained AlphaGo, applied to working memory
+- **Experiences** — named Q-learning indexes. Sales, coding, support — different definitions of "success" for different processes
+- **Decision extraction** — Opus 4.6 extracts decisions from transcripts, not actions
+- **Fully local** — Qdrant in Docker, FastEmbed for embeddings, nothing goes to the cloud
+- **Open source** — MIT License
+
+## How It Differs from Mem0/Zep
+
+| | Mem0, Zep, LangMem | OpenExp |
+|---|---|---|
+| Storage | + | + |
+| Search | Vector search | Hybrid (5 signals) |
+| Learning | None | Q-learning from outcomes |
+| Prioritization | All memories equal | Proven rise, noise sinks |
+| Decision context | None | Opus 4.6 extraction |
+
+No competitor has learned memory prioritization. The market is crowded on store/retrieve but empty on "memory that learns".
+
+---
+
+If this sounds interesting, I can send details, a demo, or answer questions. I can also record a short video explainer for your audience.
+
+GitHub: [anthroos/openexp](https://github.com/anthroos/openexp)
+Paper: [The Yerkes-Dodson Curve for AI Agents](https://arxiv.org/abs/2603.07360)
From 275fce8eddef6a8bcb498b2e9ae1f96f1931f4e4 Mon Sep 17 00:00:00 2001
From: Ivan Pasichnyk
Date: Mon, 6 Apr 2026 01:48:23 -0700
Subject: [PATCH 40/59] feat: decision extraction, experience auto-detect,
Q-value fixes, docs
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Major features:
- Decision extraction from session transcripts using Opus 4.6 via claude -p
- Experience auto-detect from prompt keywords (sales, coding, etc.)
- Per-experience Q-value routing in observation ingest
- Q-value wiring fix + cache locking for concurrent sessions
New files:
- openexp/ingest/extract_decisions.py — Opus 4.6 extracts decisions, not actions
- openexp/core/experience.py — experience auto-detection + session tracking
- openexp/data/experiences/{sales,dealflow}.yaml — shipped experience configs
- docs/decision-extraction.md — full reference for extraction system
- tests/test_experience.py — 76 new tests
Documentation:
- 4-phase learning cycle in how-it-works.md
- Updated architecture, storage system, configuration docs
- Decision extraction env vars documented
Co-Authored-By: Claude Opus 4.6
---
README.md | 12 +-
docs/architecture.md | 3 +-
docs/configuration.md | 9 +
docs/decision-extraction.md | 169 +++++
docs/how-it-works.md | 34 +-
docs/storage-system.md | 22 +-
landing.html | 870 +++++++++++++++++++++++++
openexp/core/experience.py | 63 ++
openexp/core/hybrid_search.py | 5 +-
openexp/core/q_value.py | 28 +-
openexp/data/experiences/dealflow.yaml | 19 +
openexp/data/experiences/sales.yaml | 29 +
openexp/hooks/post-tool-use.sh | 2 +-
openexp/hooks/session-end.sh | 146 ++++-
openexp/hooks/user-prompt-recall.sh | 26 +-
openexp/ingest/__init__.py | 14 +-
openexp/ingest/extract_decisions.py | 313 +++++++++
openexp/ingest/observation.py | 65 +-
tests/test_experience.py | 76 +++
tests/test_outcome.py | 10 +-
tests/test_session_end.py | 8 +
21 files changed, 1887 insertions(+), 36 deletions(-)
create mode 100644 docs/decision-extraction.md
create mode 100644 landing.html
create mode 100644 openexp/ingest/extract_decisions.py
diff --git a/README.md b/README.md
index 40c399e..2b0f91b 100644
--- a/README.md
+++ b/README.md
@@ -145,7 +145,9 @@ Three hooks integrate with Claude Code automatically:
| **SessionStart** | Session opens | Searches Qdrant for relevant memories, injects top results as context |
| **UserPromptSubmit** | Every message | Lightweight recall — adds relevant memories to each prompt |
| **PostToolUse** | After Write/Edit/Bash | Captures what Claude does as observations (JSONL) |
-| **SessionEnd** | Session closes | Generates summary, triggers ingest + reward (async) |
+| **SessionEnd** | Session closes | Summary → ingest → reward → decision extraction (async) |
+
+After each session, Opus 4.6 reads the conversation transcript and extracts **decisions** (not actions) — strategic choices, insights, and commitments that have value for future similar situations. See [Decision Extraction](docs/decision-extraction.md).
The MCP server provides 16 tools for memory operations, introspection, and calibration.
@@ -310,7 +312,8 @@ openexp/
│ ├── reward.py # Session productivity → reward signal
│ ├── retrieval_log.py # Closed-loop: which memories were recalled
│ ├── watermark.py # Idempotent ingestion tracking
-│ └── filters.py # Filter trivial observations
+│ ├── filters.py # Filter trivial observations
+│ └── extract_decisions.py # Opus 4.6 decision extraction from transcripts
│
├── resolvers/ # Outcome resolvers (pluggable)
│ └── crm_csv.py # CRM CSV stage transition → reward events
@@ -435,8 +438,9 @@ See the [Experiences Guide](docs/experiences.md) for full details.
Detailed docs are available in the [`docs/`](docs/) directory:
-- [How It Works](docs/how-it-works.md) — full explanation of the learning loop
-- [Storage System](docs/storage-system.md) — 5-level pyramid (L0–L4), all 4 reward paths
+- [How It Works](docs/how-it-works.md) — the 4-phase learning cycle
+- [Decision Extraction](docs/decision-extraction.md) — Opus 4.6 extracts decisions, not actions
+- [Storage System](docs/storage-system.md) — 5-level pyramid (L0-L4), all 4 reward paths
- [Experiences](docs/experiences.md) — domain-specific reward profiles (create your own)
- [Architecture](docs/architecture.md) — system design and data flow
- [Configuration](docs/configuration.md) — all environment variables and options
diff --git a/docs/architecture.md b/docs/architecture.md
index 4806f94..de357f1 100644
--- a/docs/architecture.md
+++ b/docs/architecture.md
@@ -79,6 +79,7 @@ Converts raw observations (JSONL) into embedded vectors in Qdrant:
4. **reward.py** — Computes session productivity score, applies Q-value updates (all 3 layers)
5. **retrieval_log.py** — Tracks which memories were recalled (for closed-loop reward)
6. **watermark.py** — Idempotency: prevents duplicate ingestion
+7. **extract_decisions.py** — Opus 4.6 extracts strategic decisions/insights from transcripts (Phase 2c)
### Outcome Resolution (`openexp/outcome.py` + `openexp/resolvers/`)
@@ -99,7 +100,7 @@ Shell scripts registered with Claude Code:
- **session-start.sh** — Builds contextual query, searches Qdrant, formats results, logs retrieval
- **user-prompt-recall.sh** — Per-message recall (skips trivial inputs), logs retrieval
- **post-tool-use.sh** — Captures Write/Edit/Bash observations, skips Read/Glob/Grep
-- **session-end.sh** — Generates session summary, triggers async ingest + reward computation
+- **session-end.sh** — Generates session summary, triggers async ingest + reward + decision extraction
## Data Persistence
diff --git a/docs/configuration.md b/docs/configuration.md
index 40e7115..24a5cf9 100644
--- a/docs/configuration.md
+++ b/docs/configuration.md
@@ -45,6 +45,15 @@ Without `ANTHROPIC_API_KEY`, memories are stored with basic metadata. The system
See [Experiences Guide](experiences.md) for details on creating custom experiences.
+### Decision Extraction
+| Variable | Default | Description |
+|----------|---------|-------------|
+| `OPENEXP_EXTRACT_MODEL` | `claude-opus-4-6` | LLM model for extraction (do not downgrade) |
+| `OPENEXP_EXTRACT_MAX_TOKENS` | `2048` | Max response tokens |
+| `OPENEXP_EXTRACT_CONTEXT_LIMIT` | `30000` | Max chars of transcript sent to LLM |
+
+Decision extraction uses `claude -p` (Claude Code pipe mode) to leverage your Max subscription. No API key needed.
+
### Ingest Pipeline
| Variable | Default | Description |
|----------|---------|-------------|
diff --git a/docs/decision-extraction.md b/docs/decision-extraction.md
new file mode 100644
index 0000000..c640b77
--- /dev/null
+++ b/docs/decision-extraction.md
@@ -0,0 +1,169 @@
+# Decision Extraction
+
+> Extract strategic decisions, insights, and commitments from session transcripts.
+> The system records "chose to lead with social proof because enterprise clients trust references" — not "edited proposal.html".
+
+## Why This Matters
+
+Without decision extraction, OpenExp records **actions** (tool calls, file edits, commands). Actions are useful for reward computation but have low strategic value — "Edited file.html" tells you nothing about **why** that edit was made or **what alternative was considered**.
+
+Decision extraction uses Opus 4.6 to read the full conversation transcript and extract:
+
+1. **Decisions** — choice points with reasoning. What was chosen, why, and what was the alternative?
+2. **Insights** — things learned about clients, markets, patterns. Why does it matter for future work?
+3. **Commitments** — promises or agreements. Who committed to what, by when?
+
+These extracted items become first-class memories in Qdrant, searchable and Q-value-ranked like any other memory.
+
+## How It Works
+
+Decision extraction runs automatically as **Phase 2c** of the SessionEnd hook (async, after ingest + reward):
+
+```
+Session ends
+ ↓
+Phase 2a: Ingest observations + session reward
+Phase 2b: Fallback reward for pre-ingested obs
+Phase 2c: Decision extraction from transcript (NEW)
+ ↓
+Find transcript JSONL for this session
+ ↓
+Read and condense transcript (skip tool results, system noise)
+ ↓
+Send to Opus 4.6 via claude -p (Max subscription)
+ ↓
+Parse JSON response → store each item in Qdrant with embedding
+```
+
+### Transcript Processing
+
+The transcript reader (`read_transcript()`) processes Claude Code JSONL transcripts:
+
+- Reads only `user` and `assistant` message types
+- Extracts text blocks, skips `tool_result` and `system-reminder` content
+- Prioritizes recent messages (builds from end, respects context limit)
+- Default context limit: 30,000 chars (configurable via `OPENEXP_EXTRACT_CONTEXT_LIMIT`)
+
+### LLM Extraction
+
+Uses `claude -p --model opus` (pipe mode) to leverage Claude Max subscription — zero API cost.
+
+The extraction prompt instructs Opus 4.6 to:
+- Think strategically: "helicopter view + details"
+- Be selective: 3-8 items per session
+- Focus on what would be valuable in a FUTURE conversation
+- Skip file edits, tool calls, code changes (already captured as observations)
+
+### Storage
+
+Each extracted item is stored in Qdrant with:
+
+```json
+{
+ "memory": "Chose to remove advertising from scope because we're not a marketing agency — client needs automation, not ads",
+ "type": "decision",
+ "source": "decision_extraction",
+ "importance": 0.8,
+ "tags": ["client-name", "scoping"],
+ "session_id": "abc-123",
+ "experience": "sales",
+ "status": "active"
+}
+```
+
+Memory types are mapped: `decision` → `decision`, `insight` → `insight`, `commitment` → `action`.
+
+## Configuration
+
+| Variable | Default | Description |
+|----------|---------|-------------|
+| `OPENEXP_EXTRACT_MODEL` | `claude-opus-4-6` | LLM model for extraction (do not downgrade) |
+| `OPENEXP_EXTRACT_MAX_TOKENS` | `2048` | Max response tokens |
+| `OPENEXP_EXTRACT_CONTEXT_LIMIT` | `30000` | Max chars of transcript sent to LLM |
+
+### Model Quality
+
+Opus 4.6 is mandatory for extraction. The quality of extracted decisions determines the quality of the entire memory system. This is the annotation layer — not a place to save money.
+
+### Recursion Guard
+
+Decision extraction runs inside the SessionEnd hook and spawns `claude -p` as a subprocess. To prevent the subprocess from triggering its own SessionEnd → extraction → subprocess loop:
+
+1. The `extract_decisions()` function sets `OPENEXP_EXTRACT_RUNNING=1` in the subprocess environment
+2. `session-end.sh` checks this variable at startup and exits immediately if set
+
+## API
+
+### `read_transcript(transcript_path, session_id=None) -> str`
+
+Read and condense a Claude Code JSONL transcript. Returns formatted text with `IVAN:` and `ASSISTANT:` prefixes.
+
+### `extract_decisions(transcript_text, session_id="", experience="default") -> List[Dict]`
+
+Extract decisions from transcript text using Opus 4.6. Returns list of items:
+
+```python
+[
+ {
+ "type": "decision",
+ "content": "One clear sentence describing what happened and WHY",
+ "importance": 0.8,
+ "tags": ["domain", "client"],
+ "client_id": "comp-xxx" # or null
+ }
+]
+```
+
+### `extract_and_store(transcript_path, session_id, experience="default", dry_run=False) -> Dict`
+
+Full pipeline: read transcript → extract → store in Qdrant.
+
+```python
+# Dry run (extract without storing)
+result = extract_and_store(path, session_id, dry_run=True)
+# {"extracted": 6, "items": [...], "dry_run": True}
+
+# Real run
+result = extract_and_store(path, session_id, experience="sales")
+# {"extracted": 6, "stored": 6, "experience": "sales", "model": "claude-opus-4-6"}
+```
+
+## Example Output
+
+From a real session about a client proposal:
+
+```json
+[
+ {
+ "type": "decision",
+ "content": "Removed advertising from project scope because we're not a marketing agency — client needs CRM+email+follow-up automation, not Google Ads management",
+ "importance": 0.9,
+ "tags": ["client-project", "scoping", "pricing"]
+ },
+ {
+ "type": "insight",
+ "content": "For small service businesses, semi-automatic approach (Claude Code + one click) is more valuable than full automation: follow-up semi-auto = 2-3 hrs vs full auto = 8-12 hrs. Client needs control, not full autonomy.",
+ "importance": 0.8,
+ "tags": ["product-strategy", "semi-auto-vs-auto"]
+ },
+ {
+ "type": "insight",
+ "content": "All won clients came through network/referrals — zero presence on freelance platforms despite strong fit. Untapped channel.",
+ "importance": 0.8,
+ "tags": ["sales-channel", "growth"]
+ },
+ {
+ "type": "commitment",
+ "content": "TODO: finalize scope, update price in HTML proposal, send to client by tomorrow",
+ "importance": 0.6,
+ "tags": ["follow-up"]
+ }
+]
+```
+
+## Files
+
+| File | Purpose |
+|------|---------|
+| `openexp/ingest/extract_decisions.py` | Core module: read, extract, store |
+| `openexp/hooks/session-end.sh` | Phase 2c integration (lines 235-272) |
diff --git a/docs/how-it-works.md b/docs/how-it-works.md
index 1074913..c44ef7b 100644
--- a/docs/how-it-works.md
+++ b/docs/how-it-works.md
@@ -50,7 +50,19 @@ When the session ends, the SessionEnd hook:
2. Saves it to `~/.openexp/sessions/`
3. Triggers async ingest + reward computation (runs in background so it doesn't block exit)
-### 4. Q-Learning Reward Loop
+### 4. Decision Extraction (SessionEnd Phase 2c)
+
+After ingest and reward, Opus 4.6 reads the full conversation transcript and extracts:
+
+- **Decisions** — "Chose to remove advertising from scope because we're not a marketing agency"
+- **Insights** — "All won clients came through referrals — zero presence on freelance platforms"
+- **Commitments** — "Finalize proposal and send by tomorrow"
+
+This is the critical difference between recording "Edited proposal.html" (action) and recording "Chose to lead with social proof because enterprise clients trust references" (decision with reasoning). Decisions have strategic value; actions don't.
+
+See [Decision Extraction](decision-extraction.md) for full details.
+
+### 5. Q-Learning Reward Loop
This is the core innovation. After each session:
@@ -65,6 +77,26 @@ Q_new = (1 - 0.25) × Q_old + 0.25 × reward
Over time, this creates a natural ranking where useful memories (project conventions, working solutions, important decisions) rise to the top, while noise (trivial commands, one-off fixes) sinks.
+## The 4-Phase Learning Cycle
+
+OpenExp learns in four phases, each building on the previous:
+
+**Phase 1 — Store.** Agent works, system writes every action, decision, and context to the vector database. Hooks handle this automatically. Retrieval at this stage = basic vector search.
+
+**Phase 2 — Auto-reward.** After each session, the system evaluates productivity (commits, PRs, deploys, emails sent). Memories from productive sessions get higher Q-values. Noise starts sinking.
+
+**Phase 3 — Decision extraction.** Opus 4.6 reads the conversation transcript and extracts strategic decisions, insights, and commitments. These become first-class memories — the kind of context that changes how you approach the next similar situation.
+
+**Phase 4 — Human calibration.** After a significant outcome (deal closed, project shipped), the user reviews related memories and calibrates Q-values. "This memory directly contributed to closing the deal" → Q goes up. "This was irrelevant noise" → Q goes down.
+
+### What you see over time
+
+| Time | What happens |
+|------|-------------|
+| **Week 1** | System stores everything. Retrieval = vector search. |
+| **Month 1** | Auto-rewards separate productive from empty sessions. Decision extraction adds strategic memories. |
+| **Month 3** | Retrieval is fundamentally different from plain search. Proven decisions surface first. Noise is gone. |
+
## Reward Signals
Reward weights are defined by the active **Experience**. The `default` experience rewards coding; `sales` rewards emails and follow-ups; `dealflow` rewards proposals, invoices, and payments. See [Experiences](experiences.md) for full details and how to create your own.
diff --git a/docs/storage-system.md b/docs/storage-system.md
index 501cd83..0c7c152 100644
--- a/docs/storage-system.md
+++ b/docs/storage-system.md
@@ -3,7 +3,7 @@
> **Purpose:** This document describes the full storage architecture so that Claude
> doesn't have to re-read every source file each session. Read THIS instead of the code.
>
-> **Last updated:** 2026-03-26 (after L4 audit, all gaps fixed, 237 tests pass)
+> **Last updated:** 2026-04-05 (experience routing fix, 250 tests pass)
---
@@ -314,7 +314,7 @@ Same memory can have different Q-values per experience (e.g., "default", "sales"
↓
filters.py (drops ~60-70% trivial obs)
↓
- observation.py (batch embed via FastEmbed → upsert to Qdrant)
+ observation.py (batch embed via FastEmbed → upsert to Qdrant, experience-aware Q init)
↓
~/.openexp/sessions/*.md (written by session-end hook)
↓
@@ -323,8 +323,16 @@ Same memory can have different Q-values per experience (e.g., "default", "sales"
reward.py (compute session reward → update Q-values)
↓
watermark.py (mark processed obs IDs for idempotency)
+ ↓
+~/.claude/projects/*/*.jsonl (Claude Code transcripts)
+ ↓
+ extract_decisions.py (Opus 4.6 via claude -p → decisions/insights → Qdrant)
```
+### Decision Extraction (`ingest/extract_decisions.py`)
+
+Runs as Phase 2c of SessionEnd (after ingest + reward). Uses Opus 4.6 to extract strategic decisions, insights, and commitments from the conversation transcript. See [Decision Extraction](decision-extraction.md) for details.
+
### Filters (`ingest/filters.py`)
Drops: read-only commands (cat, grep, ls), short summaries (<15 chars), Read/Glob/Grep tool calls.
@@ -339,7 +347,7 @@ Keeps: Write, Edit, Bash with side effects, decisions, valuable tags.
| **SessionStart** | `session-start.sh` | Session begins | Search Qdrant → inject top-5 memories → log retrieval IDs |
| **UserPromptSubmit** | `user-prompt-recall.sh` | Each message | Context recall (skip trivial) → inject |
| **PostToolUse** | `post-tool-use.sh` | After Write/Edit/Bash | Write observation to JSONL (skip reads) |
-| **SessionEnd** | `session-end.sh` | Session ends | Generate summary → async ingest → compute reward |
+| **SessionEnd** | `session-end.sh` | Session ends | Generate summary → async ingest → reward → decision extraction |
---
@@ -372,11 +380,12 @@ Keeps: Write, Edit, Bash with side effects, decisions, valuable tags.
| File | Purpose |
|------|---------|
| `ingest/filters.py` | Drop trivial observations |
-| `ingest/observation.py` | Batch embed → Qdrant upsert |
+| `ingest/observation.py` | Batch embed → Qdrant upsert (passes `experience` to Q-cache init) |
| `ingest/session_summary.py` | Parse session markdown → memories |
| `ingest/reward.py` | Session reward computation + Q-update + L3/L4 |
| `ingest/retrieval_log.py` | Track recalled memory IDs |
| `ingest/watermark.py` | Idempotent ingestion tracking |
+| `ingest/extract_decisions.py` | Opus 4.6 decision extraction from transcripts |
### Reward Paths
@@ -437,12 +446,15 @@ Keeps: Write, Edit, Bash with side effects, decisions, valuable tags.
| `QDRANT_PORT` | `6333` | Qdrant port |
| `QDRANT_API_KEY` | `""` | Qdrant auth (optional) |
| `ANTHROPIC_API_KEY` | `""` | For enrichment + explanations |
+| `OPENEXP_EXTRACT_MODEL` | `claude-opus-4-6` | Decision extraction model |
+| `OPENEXP_EXTRACT_MAX_TOKENS` | `2048` | Max tokens for extraction |
+| `OPENEXP_EXTRACT_CONTEXT_LIMIT` | `30000` | Max transcript chars sent to LLM |
---
## 14. Test Coverage
-237 tests across 11 test files. Key test files for the storage system:
+250 tests across 11 test files. Key test files for the storage system:
| File | Tests | What |
|------|-------|------|
diff --git a/landing.html b/landing.html
new file mode 100644
index 0000000..39628a8
--- /dev/null
+++ b/landing.html
@@ -0,0 +1,870 @@
+<!-- landing.html (870 lines): markup not recoverable from this excerpt; page copy below -->
+
+Title: OpenExp — Self-labeling experience engine for AI agents
+Badge: Open Source · MIT License
+
+Hero: Your AI doesn't learn from outcomes. OpenExp fixes that.
+Define your business process. Every outcome — commit, closed deal, resolved ticket — feeds back as a reward signal. Over time, proven memories surface first. Noise sinks.
+
+Quick start:
+    docker run -d --name qdrant -p 6333:6333 qdrant/qdrant
+    # Register hooks with Claude Code
+    openexp hooks install
+    # Done. Use Claude Code as normal.
+
+Section: The Learning Loop
+Every session makes the next one smarter. The same algorithm behind AlphaGo — applied to your AI's working memory.
+    🧠 Recall — Top memories injected into context, ranked by Q-value
+    ⚙️ Work — Every action captured automatically as observations
+    📊 Evaluate — Session ends — did anything productive happen?
+    🔄 Reward — Productive? Recalled memories get higher scores
+
+Section: The Problem with AI Memory Today
+    No Learning (Static instructions) — You write a CLAUDE.md with rules. The AI reads it every session. It works — but it never updates its understanding. To change priorities, you edit the file by hand.
+    Doesn't Scale (Full context window) — Pack everything into context — CRM, docs, chat history. Expensive, slow, and eventually you can't fit it all in. More tokens, diminishing returns.
+    No Signal (Memory services) — Mem0, Zep, LangMem store and retrieve. But every memory is equally important. A critical decision and a random grep command have the same weight.
+
+Section: How OpenExp Works
+Write everything. Remember selectively. Learn from outcomes.
+    1. Automatic capture — Every action in your Claude Code session — file edits, commits, commands, decisions — is automatically recorded. Hooks handle it. Zero manual work.
+    2. Smart retrieval — Before each response, the system finds the most relevant memories. Not by similarity alone — by proven usefulness. Five ranking signals, not just vector search.
+    3. Reward loop — After every session, the system evaluates what happened. Productive sessions reward the memories that were used. Empty sessions penalize them. Q-values update automatically.
+
+Section: Session Signals
+After each session, OpenExp checks what was produced and assigns a reward score.
+    Code committed +0.30 | Pull request created +0.20 | Deployed to production +0.10 | Tests passed +0.10 | Deal closed (CRM) +0.80 | Nothing produced -0.10
+
+Section: Experiences — Your Process, Your Rewards
+One memory can be valuable in one context and worthless in another. Define what "productive" means for your workflow.