# evaluation.py
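"""Offline evaluation for the candidate-ranking pipeline in ml_pipeline.

Ranks each benchmark case's candidates with rank_candidates and scores the
ordering against hand-labelled relevance judgements using precision@k,
recall@k, and mean reciprocal rank (MRR).
"""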

from __future__ import annotations

from dataclasses import dataclass
from typing import Iterable

import pandas as pd

from ml_pipeline import preprocess_text, rank_candidates


@dataclass
class EvalCase:
    """One labelled benchmark case: a job posting, its candidate pool, and
    the set of candidate names judged relevant to the posting."""

    job_description: str
    required_skills: list[str]
    candidates: list[dict]
    relevant_candidates: set[str]


def precision_at_k(ranked_names: list[str], relevant: set[str], k: int) -> float:
    """Fraction of the top-k ranked names that are relevant."""
    if k == 0:
        return 0.0
    top_k = ranked_names[:k]
    return sum(1 for name in top_k if name in relevant) / k
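
# Worked example (illustrative names, not from the benchmark):
# precision_at_k(["alice", "ben", "carla"], {"alice", "dina"}, k=2) keeps
# ["alice", "ben"]; 1 of the 2 kept names is relevant, so the score is 0.5.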


def recall_at_k(ranked_names: list[str], relevant: set[str], k: int) -> float:
    """Fraction of all relevant names that appear in the top k."""
    if not relevant:
        return 0.0
    top_k = ranked_names[:k]
    hits = sum(1 for name in top_k if name in relevant)
    return hits / len(relevant)
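
# Worked example: recall_at_k(["alice", "ben"], {"alice", "dina"}, k=2)
# retrieves 1 of the 2 relevant names, so the score is 0.5.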


def reciprocal_rank(ranked_names: list[str], relevant: set[str]) -> float:
    """1 / rank of the first relevant name, or 0.0 if none is ranked."""
    for idx, name in enumerate(ranked_names, 1):
        if name in relevant:
            return 1.0 / idx
    return 0.0
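
# Worked example: reciprocal_rank(["ben", "alice"], {"alice"}) finds the
# first relevant name at rank 2, so the score is 1/2 = 0.5.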


def evaluate_case(case: EvalCase, k: int = 3) -> dict[str, float]:
    """Rank one case's candidates and score the ranking at cutoff k."""
    ranked = rank_candidates(
        job_text=preprocess_text(case.job_description),
        candidates=case.candidates,
        required_skills=case.required_skills,
    )
    ranked_names = [item["name"] for item in ranked]
    return {
        "precision@k": round(precision_at_k(ranked_names, case.relevant_candidates, k), 3),
        "recall@k": round(recall_at_k(ranked_names, case.relevant_candidates, k), 3),
        "mrr": round(reciprocal_rank(ranked_names, case.relevant_candidates), 3),
    }
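
# Note: this assumes ml_pipeline.rank_candidates returns candidates sorted
# best-first as dicts carrying a "name" key, which is what the metric
# functions above consume.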


def default_benchmark_cases() -> list[EvalCase]:
    """Two small hand-labelled cases: an NLP engineer role and a data analyst role."""
    return [
        EvalCase(
            job_description="Looking for an NLP engineer with Python, SQL, Transformers, and MLOps experience.",
            required_skills=["Python", "SQL", "NLP", "Hugging Face Transformers", "MLOps"],
            relevant_candidates={"Alice Johnson", "Ben Torres"},
            candidates=[
                {
                    "name": "Alice Johnson",
                    "resume_text": "",
                    "cleaned_text": preprocess_text("Python NLP engineer using Hugging Face Transformers and MLflow with SQL."),
                    "extracted_skills": ["Python", "NLP", "Hugging Face Transformers", "MLflow", "SQL", "MLOps"],
                },
                {
                    "name": "Ben Torres",
                    "resume_text": "",
                    "cleaned_text": preprocess_text("Machine learning engineer with Python, SQL, Docker, and MLOps pipelines."),
                    "extracted_skills": ["Python", "SQL", "Docker", "MLOps"],
                },
                {
                    "name": "Carla Smith",
                    "resume_text": "",
                    "cleaned_text": preprocess_text("Frontend engineer with React and TypeScript."),
                    "extracted_skills": ["React", "TypeScript"],
                },
            ],
        ),
        EvalCase(
            job_description="Need data analyst with SQL, Tableau, dashboarding, and business analysis.",
            required_skills=["SQL", "Tableau", "Data Analysis", "Business Analysis"],
            relevant_candidates={"Dina Park"},
            candidates=[
                {
                    "name": "Dina Park",
                    "resume_text": "",
                    "cleaned_text": preprocess_text("Data analyst with SQL Tableau and business analysis stakeholder reporting."),
                    "extracted_skills": ["SQL", "Tableau", "Data Analysis", "Business Analysis"],
                },
                {
                    "name": "Eric Hall",
                    "resume_text": "",
                    "cleaned_text": preprocess_text("Backend developer with Java Spring PostgreSQL."),
                    "extracted_skills": ["Java", "Spring Boot", "PostgreSQL"],
                },
            ],
        ),
    ]
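
# The relevant_candidates sets above are hand-labelled relevance judgements
# used only as ground truth for scoring; the ranking pipeline never sees them.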


def evaluate_benchmark(cases: Iterable[EvalCase] | None = None, k: int = 3) -> pd.DataFrame:
    """Average the per-case metrics into a one-row summary DataFrame."""
    cases = list(cases) if cases is not None else default_benchmark_cases()
    scores = [evaluate_case(case, k=k) for case in cases]
    if not scores:
        aggregate = {"precision@k": 0.0, "recall@k": 0.0, "mrr": 0.0}
    else:
        aggregate = {
            "precision@k": round(sum(s["precision@k"] for s in scores) / len(scores), 3),
            "recall@k": round(sum(s["recall@k"] for s in scores) / len(scores), 3),
            "mrr": round(sum(s["mrr"] for s in scores) / len(scores), 3),
        }
    return pd.DataFrame([aggregate], index=["overall"])
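
# Usage sketch: with no arguments this scores the built-in benchmark above;
# pass your own iterable of EvalCase objects (same shape as the defaults) to
# score a different pool, e.g. evaluate_benchmark(my_cases, k=5) for a
# hypothetical list my_cases.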


if __name__ == "__main__":
    print(evaluate_benchmark(k=2).to_string())