pdudotdev
diff --git a/‎.claude.example/settings.local.json‎
Lines changed: 0 additions & 10 deletions b/‎.claude.example/settings.local.json‎
Lines changed: 0 additions & 10 deletions
diff --git a/‎.claude.example/skills/qa/SKILL.md‎
Lines changed: 2 additions & 2 deletions b/‎.claude.example/skills/qa/SKILL.md‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎.gitignore‎
Lines changed: 1 addition & 2 deletions b/‎.gitignore‎
Lines changed: 1 addition & 2 deletions
diff --git a/‎CLAUDE.md‎
Lines changed: 4 additions & 0 deletions b/‎CLAUDE.md‎
Lines changed: 4 additions & 0 deletions
diff --git a/‎README.md‎
Lines changed: 1 addition & 0 deletions b/‎README.md‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎core/settings.py‎
Lines changed: 1 addition & 0 deletions b/‎core/settings.py‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎ingest.py‎
Lines changed: 3 additions & 3 deletions b/‎ingest.py‎
Lines changed: 3 additions & 3 deletions
diff --git a/‎metadata/workflow/WORKFLOW.md‎
Lines changed: 8 additions & 113 deletions b/‎metadata/workflow/WORKFLOW.md‎
Lines changed: 8 additions & 113 deletions
diff --git a/‎requirements.txt‎
Lines changed: 4 additions & 3 deletions b/‎requirements.txt‎
Lines changed: 4 additions & 3 deletions
diff --git a/‎results/network_qa_example.xml‎
Lines changed: 33 additions & 0 deletions b/‎results/network_qa_example.xml‎
Lines changed: 33 additions & 0 deletions
@@ -7,16 +7,6 @@
       "Bash(ls:*)"
     ],
     "deny": [
-      "Read(.env)",
-      "Read(**/.env)",
-      "Read(**/.env.*)",
-      "Bash(cat .env*)",
-      "Bash(less .env*)",
-      "Bash(head .env*)",
-      "Bash(tail .env*)",
-      "Bash(more .env*)",
-      "Bash(env)",
-      "Bash(printenv *)",
       "Bash(ssh *)",
       "Bash(sshpass *)",
       "Bash(rm -rf *)",
 
@@ -18,7 +18,7 @@ Then stop.
 
 Parse the XML structure:
 - Each `<testcase>` is one test scenario. The `name` attribute is the scenario name.
-- `<properties>` contain key-value pairs: `device`, `rfc_ref`, `description`, `rfc_citation`.
+- `<properties>` contain key-value pairs: `device`, `rfc_ref`, `description`.
 - A `<testcase>` with a `<failure>` child is a failed test. The `message` attribute and text content describe the failure.
 - A `<testcase>` without `<failure>` is a pass.
 
@@ -73,6 +73,6 @@ Produce a concise report for the investigated failure:
 5. **RFC basis**: the protocol rule that explains the failure
 6. **Recovery status**: is the network still broken or has it been fixed?
 
-If there are remaining uninvestigated failures, re-present the list (without the one just investigated) and ask the user to pick the next one. Repeat until all failures are investigated or the user stops.
+**IMPORTANT — always loop back.** After the report, if there are remaining uninvestigated failures, you MUST immediately re-present the remaining failure list and ask the user to pick the next one — do not wait for the user to ask. The user acknowledging a fix ("ok", "got it", "I'll do that") is NOT a signal to stop. Only stop looping if the user explicitly declines (e.g. "that's all", "no more", "skip the rest") or all failures have been investigated.
 
 After investigating a failure, if its root cause likely explains other failures still on the list, say so — the user may choose to skip those.
@@ -7,5 +7,4 @@ yana/
 .pytest_cache/
 .ruff_cache/
 ansible_test_cases/
-ansible/collections/ansible_collections/
-results/
+ansible/collections/ansible_collections/
@@ -66,6 +66,10 @@ State clearly:
 - What the root cause is (or what further information is needed)
 - What the recommended fix is (configuration direction only — never push changes)
 
+### Multi-failure investigations
+
+When investigating multiple failures (e.g. via `/qa`), always loop back after each finding. Present remaining uninvestigated failures and ask the user to pick the next one. The user acknowledging a fix is not a signal to stop — only stop when the user explicitly declines or all failures are covered.
+
 ## Constraints
 
 - **Read-only.** Never suggest commands that change device configuration. Diagnosis and direction only.
 
@@ -52,6 +52,7 @@ Run your tests with any framework. When something fails, YANA investigates - it
 
 **Step 1 - Install and ingest:**
 ```bash
+sudo apt install git make python3.12-venv
 cd ~ && git clone https://github.com/pdudotdev/YANA
 cd YANA && make setup
 ```
 
@@ -3,6 +3,7 @@
 
 USERNAME = os.getenv("ROUTER_USERNAME", "")
 PASSWORD = os.getenv("ROUTER_PASSWORD", "")
+PASSWORD_JUNOS = os.getenv("ROUTER_PASSWORD_JUNOS", "") or PASSWORD
 
 SSH_TIMEOUT_OPS = 30
 SSH_TIMEOUT_OPS_LONG = 90
 
@@ -6,9 +6,9 @@
 from dotenv import load_dotenv
 load_dotenv(Path(__file__).parent / ".env")
 
-from langchain.schema import Document
-from langchain_community.embeddings import HuggingFaceEmbeddings
-from langchain_community.vectorstores import Chroma
+from langchain_core.documents import Document
+from langchain_huggingface import HuggingFaceEmbeddings
+from langchain_chroma import Chroma
 from langchain_text_splitters import RecursiveCharacterTextSplitter
 
 from tools.rag import _CHROMA_DIR, _COLLECTION, _EMBEDDING_MODEL
 
@@ -48,85 +48,13 @@ See [OPTIMIZATIONS.md](../scalability/OPTIMIZATIONS.md) for the full RAG optimiz
 
 ---
 
-## Interactive Investigation
-
-The user asks a question in Claude Code. The agent follows the diagnostic workflow defined in `CLAUDE.md`:
-
-### Step 0 — Preflight
-
-```
-get_status()
-```
-
-Confirms which backends are active: inventory (device count), intent (router count), and ChromaDB availability. Displayed as a table before any investigation begins.
-
-### Step 1 — Load the Protocol Skill
-
-The agent reads the relevant skill file before starting. Skill files contain decision trees and query sequences — the agent follows them, it does not improvise.
-
-| When to use | Skill file |
-|-------------|-----------|
-| Adjacency, neighbor state, LSDB, area type | `skills/ospf/SKILL.md` |
-| Path selection, PBR, route-maps, prefix-lists, AD conflicts | `skills/routing/SKILL.md` |
-| Reachability ("can't reach X from Y") | Start with `traceroute` to find the breaking hop, then load the appropriate skill |
-
-### Step 2 — Search the Knowledge Base
-
-```
-search_knowledge_base(query="OSPF neighbor stuck in INIT", topic="rfc", protocol="ospf")
-```
-
-Returns RFC text and vendor documentation relevant to the issue. The `protocol` filter eliminates cross-protocol noise. The embedding model maps the question to nearby chunks even when the exact words differ.
-
-### Step 3 — Query Live Devices
-
-The agent queries the devices involved in the issue:
-
-```
-query_intent(device="D1C")        # what SHOULD the network look like?
-get_ospf("D1C", "neighbors")      # what DOES it look like?
-get_ospf("D1C", "interfaces")     # check timers, area, passive, auth
-traceroute("E1C", "192.168.42.1") # where does the path break?
-```
-
-The skill file dictates which queries to run and in what order. For OSPF adjacency issues, the checklist is: timers → area type → network type → auth → passive → MTU → interface state. Stop at the first mismatch.
-
-### Step 4 — Synthesize
-
-The agent combines knowledge base context with live data. When they conflict, live data wins. The report states:
-
-- What the data shows
-- Root cause with RFC citation
-- Fix direction (configuration guidance only — YANA never pushes config)
-
-### Example
-
-```
-User: "Why can't E1C reach A2A's loopback?"
-
-Agent:
-  1. get_status()           → inventory, intent, ChromaDB all active
-  2. Reads skills/ospf/SKILL.md
-  3. get_routing("E1C", "ip_route")  → 192.168.42.1 missing from VRF1
-  4. get_ospf("E1C", "database")     → No Type 3 LSA for 192.168.42.1
-  5. query_intent()          → A2A should be in Area 1 (stub), connected via D1C/D2B
-  6. get_ospf("D1C", "neighbors")    → D1C has no adjacency with A2A
-  7. get_ospf("A2A", "interfaces")   → A2A's Area 1 is "normal", not stub
-  8. search_knowledge_base("E-bit mismatch stub area", topic="rfc", protocol="ospf")
-
-  Report: A2A is missing `area 1 stub`. RFC 2328 §10.5: E-bit mismatch
-          causes Hellos to be silently discarded. Fix: add stub config to A2A.
-```
-
----
-
 ## QA Investigation
 
 Run your tests with any framework. When something fails, YANA investigates.
 
 ### Test Results
 
-YANA reads JUnit XML results from `results/`. JUnit XML is the de facto standard — produced by pytest (`--junitxml`), pyATS (`--xunit`), Robot Framework (`--xunit`), Ansible (junit callback), and most other test runners.
+YANA reads JUnit XML results from `results/`. JUnit XML is the de facto standard — produced by pytest (`--junitxml`), pyATS (`--xunit`), Robot Framework (`--xunit`), and most other test runners.
 
 Place your test results in `results/` as `.xml` files. YANA doesn't care how the tests were run — it only needs the results.
 
@@ -142,52 +70,19 @@ When tests fail, the user runs `/qa` in Claude Code. The skill (`.claude/skills/
   4. Present numbered failure list to the user
   5. User picks a failure to investigate
   6. Agent reads test context from <properties> (device, rfc_ref, description)
-  7. Agent runs the same diagnostic workflow as interactive mode:
+  7. Agent investigates:
+     - get_status() → confirm backends are active
+     - Load protocol skill (skills/ospf/SKILL.md or skills/routing/SKILL.md)
      - query_intent() → expected state
-     - get_ospf/get_routing/get_interfaces → live state
-     - Follows skill decision trees to trace the root cause
+     - get_ospf/get_routing/get_interfaces/traceroute → live state
+     - Follow skill decision trees to trace the root cause
      - search_knowledge_base → RFC context
   8. Reports findings (scenario, observed, current state, root cause, RFC basis)
   9. Re-presents remaining failures — user picks next, or stops
 ```
 
 If multiple failures share a root cause, the agent says so after investigating the first one — the user can skip the rest.
 
----
-
-## Architecture Summary
+### Interactive Mode
 
-```
-                    ┌─────────────────────────────────────────┐
-                    │            Claude Code (UI)              │
-                    │                                         │
-                    │   Interactive: User asks a question     │
-                    │   QA: User runs /qa after tests         │
-                    └──────────────┬──────────────────────────┘
-                                   │ MCP protocol
-                    ┌──────────────▼──────────────────────────┐
-                    │         YANA MCP Server                  │
-                    │         server/MCPServer.py              │
-                    │                                         │
-                    │   8 tools registered via FastMCP         │
-                    └──┬───────┬───────┬───────┬──────────────┘
-                       │       │       │       │
-              ┌────────▼──┐ ┌──▼────┐ ┌▼─────┐ ┌▼──────────┐
-              │ SSH tools  │ │ RAG   │ │Intent│ │ Status    │
-              │ get_ospf   │ │search │ │query │ │ get_status│
-              │ get_routing│ │_kb    │ │_intent││ list_dev  │
-              │ get_intf   │ │       │ │      │ │           │
-              │ traceroute │ │       │ │      │ │           │
-              └─────┬──────┘ └───┬───┘ └──┬───┘ └───────────┘
-                    │            │        │
-         ┌──────────▼──┐  ┌─────▼───┐ ┌──▼──────────┐
-         │  Scrapli SSH │  │ChromaDB │ │ JSON files  │
-         │  6 vendors   │  │ + MiniLM│ │ data/*.json │
-         │  env creds   │  │         │ │             │
-         └──────────────┘  └─────────┘ └─────────────┘
-
-  Test runners (separate process, not MCP):
-    pytest, pyATS, Ansible, Robot Framework, etc.
-      → JUnit XML results in results/
-      → Consumed by /qa skill in Claude
-```
+YANA also handles ad-hoc questions outside the QA workflow. The user asks a question directly (e.g. "Why can't E1C reach A2A's loopback?") and the agent follows the same diagnostic process: preflight check via `get_status()`, load the relevant protocol skill, query live devices, search the knowledge base, and synthesize a report with root cause and RFC citation. The full interactive workflow is defined in `CLAUDE.md`.
@@ -2,9 +2,10 @@
 fastmcp>=3.0,<4.0
 
 # RAG pipeline
-langchain>=0.3,<0.4
-langchain-community>=0.3,<0.4
-chromadb>=0.6,<1.0
+langchain-core>=1.0,<2.0
+langchain-huggingface>=1.0,<2.0
+langchain-chroma>=1.0,<2.0
+langchain-text-splitters>=0.3,<1.0
 sentence-transformers>=3.0,<4.0
 
 # Environment
 
@@ -0,0 +1,33 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<testsuites name="network_qa"
+            tests="3"
+            failures="2"
+            timestamp="2026-03-30T06:15:42Z">
+  <testsuite name="network_qa"
+             tests="3"
+             failures="2">
+    <testcase name="route_to_a2a" classname="route_to_a2a">
+      <properties>
+        <property name="device" value="E1C"/>
+        <property name="rfc_ref" value="RFC 2328 &#167;16"/>
+        <property name="description" value="Verify E1C has route to A2A loopback 192.168.42.1 in VRF1"/>
+      </properties>
+      <failure message="Verify E1C has route to A2A loopback 192.168.42.1 in VRF1">Assertion route_exists(&quot;192.168.42.1&quot;) returned False. NETCONF response contains VRF1 routing entries but 192.168.42.1/32 is not present in the RIB.</failure>
+    </testcase>
+    <testcase name="ospf_adj_e1c_c1j" classname="ospf_adj_e1c_c1j">
+      <properties>
+        <property name="device" value="E1C"/>
+        <property name="rfc_ref" value="RFC 2328 &#167;10.3"/>
+        <property name="description" value="Verify E1C has FULL OSPF adjacency with C1J (router-id 22.22.22.11)"/>
+      </properties>
+      <failure message="Verify E1C has FULL OSPF adjacency with C1J (router-id 22.22.22.11)">Assertion ospf_neighbor_full(&quot;22.22.22.11&quot;) returned False. Neighbor 22.22.22.11 found but adjacency state is INIT, not FULL.</failure>
+    </testcase>
+    <testcase name="route_map_e1c_to_c1j" classname="route_map_e1c_to_c1j">
+      <properties>
+        <property name="device" value="C1J"/>
+        <property name="rfc_ref" value="RFC 2328 &#167;16.4"/>
+        <property name="description" value="Verify route-map on E1C redistributes static route 10.99.99.0/24 to C1J via OSPF"/>
+      </properties>
+    </testcase>
+  </testsuite>
+</testsuites>