-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy patheval.py
More file actions
137 lines (114 loc) · 5.07 KB
/
Copy patheval.py
File metadata and controls
137 lines (114 loc) · 5.07 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
import asyncio
import json
import string
from dataclasses import dataclass, asdict

import httpx
# Root address of the service under evaluation; endpoint paths are appended to it.
BASE_URL = "http://localhost:8000"

# ── golden test set ────────────────────────────────────────────────────────────
# In-domain questions, evaluated with the off-topic flag set to False.
PYTHON_QUERIES = [
    "How do I read a CSV file with pandas?",
    "How to reverse a list in Python?",
    "How do I handle exceptions in Python?",
    "What is the difference between @staticmethod and @classmethod?",
    "How do I use async and await in Python?",
    "How do Python decorators work?",
    "What is the fastest way to check if a key exists in a dictionary?",
    "How do I use list comprehension with conditions?",
]

# Out-of-domain questions, used to measure the off-topic rejection rate.
OFF_TOPIC_QUERIES = [
    "What is the capital of France?",
    "Who won the FIFA World Cup in 2022?",
    "How do I make pasta carbonara?",
]
@dataclass
class EvalResult:
    """Per-query evaluation record.

    Attributes:
        query: The question text that was evaluated.
        is_off_topic_test: True when the query comes from the off-topic set.
        retrieval_relevance: Mean similarity of the retrieved chunks.
        answer_length: Length of the answer text; a proxy for completeness.
        faithfulness_score: Lexical-overlap proxy for faithfulness.
        correctly_rejected: Only meaningful for off-topic queries.
        latency_ms: Latency in milliseconds.
        confidence: Confidence value recorded for the answer.
    """

    query: str
    is_off_topic_test: bool
    retrieval_relevance: float
    answer_length: int
    faithfulness_score: float
    correctly_rejected: bool
    latency_ms: int
    confidence: float
def lexical_faithfulness(answer: str, sources: list[dict]) -> float:
    """
    Proxy faithfulness: what fraction of source titles' keywords appear in the answer?
    Real faithfulness would use an LLM judge (e.g. Ragas), but this runs offline.

    A keyword is a whitespace-separated title word, lower-cased and stripped of
    surrounding punctuation, that is longer than three characters. A keyword
    counts as a hit when it occurs anywhere in the lower-cased answer
    (substring match).

    Args:
        answer: The generated answer text; ``None`` is treated as empty.
        sources: Source records; each one's ``"title"`` value is read. A
            missing or null title contributes no keywords.

    Returns:
        ``hits / total`` rounded to three decimals, or 0.0 when there are no
        sources or no keywords.
    """
    if not sources:
        return 0.0
    answer_lower = (answer or "").lower()
    hits = 0
    total = 0
    for src in sources:
        title = src.get("title") or ""
        # Strip punctuation so "pandas?" or "list:" can match the bare word
        # in the answer instead of being an automatic miss.
        words = (w.strip(string.punctuation) for w in title.lower().split())
        keywords = [w for w in words if len(w) > 3]
        total += len(keywords)
        hits += sum(kw in answer_lower for kw in keywords)
    return round(hits / total, 3) if total else 0.0
async def evaluate_query(client: httpx.AsyncClient, query: str, off_topic: bool) -> EvalResult:
    """POST one question to the ``/ask`` endpoint and score the response.

    Args:
        client: Open HTTP client used to send the request.
        query: Question text, sent as the ``"question"`` field of the JSON body.
        off_topic: True when the query belongs to the off-topic test set.

    Returns:
        An ``EvalResult`` built from the response's ``"answer"``, ``"sources"``,
        ``"off_topic"``, ``"latency_ms"`` and ``"confidence"`` fields.

    Raises:
        Whatever ``resp.raise_for_status()`` raises for an error status.
        KeyError: If a source record has no ``"relevance"`` key.
    """
    resp = await client.post(
        f"{BASE_URL}/ask",
        json={"question": query},
        timeout=30,
    )
    resp.raise_for_status()
    data = resp.json()

    # `dict.get(key, default)` only covers a missing key; `or` also covers a
    # field that is present but JSON null, which would otherwise crash len()
    # or the summary arithmetic later on.
    answer = data.get("answer") or ""
    sources = data.get("sources") or []

    retrieval_relevance = (
        sum(s["relevance"] for s in sources) / len(sources) if sources else 0.0
    )
    faithfulness = lexical_faithfulness(answer, sources)
    # Coerce to a real bool: `off_topic and value` would otherwise store the
    # raw response value in a field declared as bool.
    correctly_rejected = bool(off_topic and data.get("off_topic", False))

    return EvalResult(
        query=query,
        is_off_topic_test=off_topic,
        retrieval_relevance=round(retrieval_relevance, 3),
        answer_length=len(answer),
        faithfulness_score=faithfulness,
        correctly_rejected=correctly_rejected,
        latency_ms=data.get("latency_ms") or 0,
        confidence=data.get("confidence") or 0.0,
    )
def _mean(values: list[float]) -> float:
    """Return the arithmetic mean of *values*, or 0.0 when the list is empty."""
    return sum(values) / len(values) if values else 0.0


async def run_eval() -> None:
    """Run the whole evaluation: health check, all queries, report, JSON dump.

    Sends every query in ``PYTHON_QUERIES`` and ``OFF_TOPIC_QUERIES``
    concurrently, prints a summary table and per-query breakdown, and writes
    all results to ``eval_results.json`` in the current working directory.

    Raises:
        Whatever ``raise_for_status()`` raises if the health check or any
        query returns an error status.
    """
    async with httpx.AsyncClient() as client:
        # Check health first
        health = await client.get(f"{BASE_URL}/health")
        # Fail with a clear HTTP error rather than a confusing JSON/KeyError
        # when the service is up but unhealthy (same policy as /ask requests).
        health.raise_for_status()
        print(f"✅ Service status: {health.json()['status']}\n")

        tasks = (
            [evaluate_query(client, q, False) for q in PYTHON_QUERIES]
            + [evaluate_query(client, q, True) for q in OFF_TOPIC_QUERIES]
        )
        # gather() preserves task order, so results line up with the query lists.
        results: list[EvalResult] = await asyncio.gather(*tasks)

    # ── summary ────────────────────────────────────────────────────────────────
    python_results = [r for r in results if not r.is_off_topic_test]
    off_topic_results = [r for r in results if r.is_off_topic_test]

    # _mean() returns 0.0 for an empty group instead of dividing by zero.
    avg_relevance = _mean([r.retrieval_relevance for r in python_results])
    avg_faithfulness = _mean([r.faithfulness_score for r in python_results])
    avg_latency = _mean([r.latency_ms for r in python_results])
    rejection_rate = _mean(
        [1.0 if r.correctly_rejected else 0.0 for r in off_topic_results]
    )

    print("=" * 60)
    print("EVAL RESULTS — Python Q&A RAG Pipeline")
    print("=" * 60)
    print(f"{'Metric':<35} {'Score':>10}")
    print("-" * 60)
    print(f"{'Avg retrieval relevance (0–1)':<35} {avg_relevance:>10.3f}")
    print(f"{'Avg answer faithfulness (0–1)':<35} {avg_faithfulness:>10.3f}")
    print(f"{'Avg latency (ms)':<35} {avg_latency:>10.0f}")
    print(f"{'Off-topic rejection rate':<35} {rejection_rate:>10.0%}")
    print("=" * 60)

    print("\nPer-query breakdown (Python questions):")
    for r in python_results:
        print(
            f"  [{r.latency_ms:>4}ms | rel={r.retrieval_relevance:.2f} | "
            f"faith={r.faithfulness_score:.2f}] {r.query[:60]}"
        )

    print("\nOff-topic queries:")
    for r in off_topic_results:
        status = "✅ REJECTED" if r.correctly_rejected else "❌ NOT REJECTED"
        print(f"  {status} — {r.query}")

    # Save JSON for notebook/README
    # Explicit encoding keeps the output independent of the platform default.
    with open("eval_results.json", "w", encoding="utf-8") as f:
        json.dump([asdict(r) for r in results], f, indent=2)
    print("\n📄 Full results saved to eval_results.json")
# Script entry point: run the evaluation only when executed directly, not on import.
if __name__ == "__main__":
    asyncio.run(run_eval())