-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathsimple_eval.py
More file actions
72 lines (59 loc) · 2.35 KB
/
simple_eval.py
File metadata and controls
72 lines (59 loc) · 2.35 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
from datasets import load_dataset
from unsloth import FastLanguageModel
from tqdm import tqdm
import re
# Load model using Unsloth
# Loads a 4-bit quantized Qwen2.5 instruct checkpoint and its tokenizer,
# then switches the model into Unsloth's fast-inference mode.
# NOTE(review): commented-out names below are smaller checkpoints kept for
# quick swapping during experiments — not dead code to delete.
model_name = "unsloth/Qwen2.5-3B-Instruct-bnb-4bit"
#model_name = "unsloth/Qwen2.5-1.5B-Instruct-unsloth-bnb-4bit"
#model_name = "unsloth/Qwen2.5-0.5B-Instruct-bnb-4bit"
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name,
    dtype = None,        # let Unsloth pick the compute dtype for this GPU
    load_in_4bit = True, # bnb 4-bit quantization to fit in less VRAM
)
# Enables Unsloth's optimized inference path (disables training-only hooks).
FastLanguageModel.for_inference(model)
# Sampling settings
# Unsloth uses the transformers pipeline, which takes arguments directly.
# Load dataset
# Pulls the math evaluation set from the Hugging Face Hub.
# NOTE(review): the "train" split is used as the test set here — presumably
# the dataset only publishes one split; confirm on the Hub.
dataset = load_dataset("Sebasdi/art_math_test", split="train")
# Evaluation loop
# Prompts the model on the first 100 questions, extracts the text between
# <answer>...</answer> tags, and scores 1 when prediction and gold answer
# contain each other (lenient substring match). Per-example results are
# appended to results.txt; overall accuracy is printed at the end.
correct = 0
total = 0

# Compile once and hoist out of the loop. re.DOTALL is required so an
# answer that spans multiple lines inside the tags is still captured
# (the original pattern missed those entirely).
ANSWER_RE = re.compile(r'<answer>(.*?)</answer>', re.DOTALL)

print("Evaluating...")
# Open the results file once for the whole run. The original truncated the
# file in one pass and then re-opened it in append mode on every iteration.
with open("results.txt", "w") as f:
    for example in tqdm(dataset.select(range(100))):
        question = example["question"]
        messages = [
            {"role": "system", "content": "You are a clever agent that is asked to solve math questions. If you want to think before the answer do so in between the <think> and </think> tags. Provide your final answer in the format <answer>your answer</answer>"},
            {"role": "user", "content": f"""Question:\n{question}\n\n"""}
        ]
        prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        inputs = tokenizer([prompt], return_tensors="pt").to("cuda")
        # Remember the prompt length so we decode only the newly generated tokens.
        input_len = inputs["input_ids"].shape[1]
        outputs = model.generate(**inputs, max_new_tokens=250, use_cache=True)
        generated = tokenizer.batch_decode(outputs[:, input_len:], skip_special_tokens=True)[0]
        print("Answer:", example["answer"])
        print("Generated:", generated)

        extracted_answer = ANSWER_RE.search(generated)
        if extracted_answer:
            # Strip surrounding whitespace/newlines the model often emits
            # inside the tags; otherwise the containment check below fails
            # on an otherwise-correct answer.
            pred = extracted_answer.group(1).strip()
            print("extracted answer:", pred)
            print("\n")
        else:
            pred = "NONE"

        # Lenient bidirectional containment match. Guard the "NONE" sentinel
        # explicitly so a missing answer can never score 1 just because the
        # gold answer happens to contain the substring "NONE".
        if pred != "NONE" and (pred in example["answer"] or example["answer"] in pred):
            reward = 1
        else:
            reward = 0
        correct += reward
        total += 1

        # write question answer and extracted answer to file
        f.write(f"Question: {question}\n")
        f.write(f"Answer: {example['answer']}\n")
        f.write(f"Extracted answer: {pred}\n")
        f.write(f"Reward: {reward}\n")
        f.write("\n")

# Guard against ZeroDivisionError when the dataset slice is empty.
accuracy = correct / total * 100 if total else 0.0
print(f"Accuracy: {accuracy:.2f}% on {total} samples")