SLOT/eval_only_slot.py at main · maple-research-lab/SLOT · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
import os

from transformers import AutoTokenizer

import torch
import re
from datasets import load_dataset
import random
import argparse

from modeling_qwen2_slot import Qwen2ForCausalLM

def reward_correct(item, answer):
    """Score whether the last number in ``answer`` matches the reference answer.

    The last numeric token (decimal, fraction or integer) found in ``answer``
    is parsed with ``math_verify.parse`` and compared against the parsed
    reference ``item["A"]`` using ``math_verify.verify``.

    Args:
        item: Mapping whose ``"A"`` entry holds the reference answer string.
        answer: Model completion text to score.

    Returns:
        ``1.0`` if ``verify`` accepts the extracted number against the
        reference, ``-1.0`` otherwise (no number found, a parse step raised,
        a parse result is ``None``, or the values do not match).
    """
    # Remove thousands separators ("1,080" -> "1080") before extraction;
    # otherwise the digit pattern below would pick "080" as the last number
    # and a correct answer would be scored as wrong. Only a comma that sits
    # between a digit and a group of exactly three digits is dropped, so
    # enumerations such as "3, 4" or "1,2" are left untouched.
    normalized = re.sub(r'(?<=\d),(?=\d{3}(?!\d))', '', answer)

    pattern = r'\d+\.\d+|\d+/\d+|\d+'
    nums = re.findall(pattern, normalized)
    if not nums:
        return -1.0
    lastnum = nums[-1]

    # Imported only once there is something to verify, so completions
    # without any number are scored without needing math_verify.
    from math_verify import parse, verify, ExprExtractionConfig

    try:
        ans_parsed = parse(lastnum, extraction_config=[ExprExtractionConfig()])
        ground_truth_parsed = parse(item["A"], extraction_config=[ExprExtractionConfig()])
    except Exception:
        # Best-effort scoring: any failure while parsing either side counts
        # as an incorrect answer rather than aborting the evaluation run.
        return -1.0

    if ans_parsed is None or ground_truth_parsed is None:
        return -1.0

    return 1.0 if verify(ans_parsed, ground_truth_parsed) else -1.0

def reward_format(item, answer):
    """Score whether ``answer`` follows the ``<think>…</think><answer>…</answer>`` layout.

    Args:
        item: Unused; kept so the signature matches ``reward_correct``.
        answer: Model completion text to check.

    Returns:
        ``1.25`` when the whole completion matches the expected tag layout,
        ``-1.0`` otherwise.
    """
    # DOTALL lets "." span newlines inside the think / answer sections.
    layout = re.compile(r"^<think>.*?</think><answer>.*?</answer>$", re.DOTALL)
    if layout.match(answer) is None:
        return -1.0
    return 1.25

def evaluate_model(model, tokenizer, eval_samples=None, split="test", generation_params=None, seed=42, log_file="evaluation_log.txt"):
    """Evaluate the model's answer and format accuracy on the GSM8K dataset.

    Each question is wrapped in a chat template together with a fixed system
    prompt, a completion is generated, and the completion is scored with
    ``reward_format`` and ``reward_correct``. Per-sample details and the final
    summary are printed and appended to ``log_file``.

    Args:
        model: Causal LM exposing ``eval()``, ``device`` and ``generate()``.
        tokenizer: Tokenizer exposing ``apply_chat_template`` and ``decode``.
        eval_samples: If not ``None`` and smaller than the dataset, the number
            of questions randomly sampled for evaluation.
        split: Dataset split passed to ``load_dataset``.
        generation_params: Keyword arguments forwarded to ``model.generate``.
            ``None`` means no extra arguments.
        seed: Seed for Python's ``random`` module (controls sample selection).
        log_file: Path of the text log that results are appended to.

    Returns:
        Tuple ``(accuracy, format_accuracy)``, each a fraction of the number
        of evaluated samples (``0`` when there are no samples).
    """
    print("Starting model evaluation...")
    model.eval()
    random.seed(seed)

    # The parameter defaults to None, and ``**None`` raises TypeError, so
    # fall back to an empty mapping when the caller supplied nothing.
    generation_kwargs = dict(generation_params) if generation_params is not None else {}

    # Load the evaluation dataset; the reference answer is the text that
    # follows the last '####' marker of each solution.
    eval_dataset = load_dataset("openai/gsm8k", "main", split=split)
    eval_QAs = [{'Q': x, 'A': y.split('####')[-1].strip()}
                for x, y in zip(eval_dataset['question'], eval_dataset['answer'])]

    # Randomly select samples for evaluation if specified
    if eval_samples is not None and len(eval_QAs) > eval_samples:
        eval_QAs = random.sample(eval_QAs, eval_samples)

    # Print the actual number of samples being evaluated
    print(f"Evaluating {len(eval_QAs)} samples")

    # Append evaluation info to the log. The encoding is explicit so that
    # non-ASCII model output cannot fail under a non-UTF-8 locale default.
    with open(log_file, "a", encoding="utf-8") as f:
        f.write(f"Number of evaluation samples: {len(eval_QAs)}\n\n")

    system_prompt = """You are a helpful assistant. A conversation between User and Assistant. The user asks a question, and the Assistant solves it. The Assistant first thinks about the reasoning process in the mind and then provides the user with the answer.\
The reasoning process and answer are enclosed within <think> </think> and<answer> </answer> tags, respectively, i.e., <think> reasoning process here </think><answer> answer here </answer>."""

    correct = 0
    format_correct = 0
    total = len(eval_QAs)

    for i, qa in enumerate(eval_QAs):
        if (i + 1) % 10 == 0:
            print(f"Evaluated {i+1}/{total} samples")

        prompt_text = tokenizer.apply_chat_template([
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": qa['Q']}
        ], tokenize=False, add_generation_prompt=True)

        inputs = tokenizer(prompt_text, return_tensors="pt", add_special_tokens=False).to(model.device)

        # Re-set before every generate call, exactly as the original did.
        # NOTE(review): presumably consumed (and possibly reset) by the SLOT
        # model implementation in modeling_qwen2_slot - confirm there.
        os.environ["prompt_only"] = "True"
        outputs = model.generate(
            **inputs,
            **generation_kwargs,
        )

        # Decode only the newly generated tokens (everything after the prompt).
        prompt_len = inputs['input_ids'].shape[1]
        completion = tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=True)

        # Check format and correctness; both reward functions return a
        # positive score on success and a negative one on failure.
        is_format_correct = reward_format(qa, completion) > 0
        is_answer_correct = reward_correct(qa, completion) > 0

        if is_format_correct:
            format_correct += 1
        if is_answer_correct:
            correct += 1

        # Log sample information; the file is reopened per sample, so each
        # record is flushed when the handle closes.
        with open(log_file, "a", encoding="utf-8") as f:
            f.write(f"Sample {i+1}:\n")
            f.write(f"Question: {qa['Q']}\n")
            f.write(f"Model Response: {completion}\n")
            f.write(f"Correct Answer: {qa['A']}\n")
            f.write(f"Format Correct: {is_format_correct}, Answer Correct: {is_answer_correct}\n\n")

        # Print detailed information for every sample
        print(f"\n--- Sample {i+1} ---")
        print("Question:", qa['Q'])
        print("Model Response:", completion)
        print("Correct Answer:", qa['A'])
        print(f"Format Correct: {is_format_correct}, Answer Correct: {is_answer_correct}")

    accuracy = correct / total if total > 0 else 0
    format_accuracy = format_correct / total if total > 0 else 0

    print(f"\nEvaluation Results (Samples: {total}):")
    print(f"Answer Accuracy: {accuracy:.4f}")
    print(f"Format Accuracy: {format_accuracy:.4f}")

    # Log overall results
    with open(log_file, "a", encoding="utf-8") as f:
        f.write(f"Evaluation Results (Samples: {total}):\n")
        f.write(f"Answer Accuracy: {accuracy:.4f}\n")
        f.write(f"Format Accuracy: {format_accuracy:.4f}\n")

    return accuracy, format_accuracy

def main():
    """Parse CLI arguments, load the model and run the GSM8K evaluation.

    A log file named after the ``times`` and ``lr`` environment variables is
    (re)created under ``logs/``, the run configuration is written to it, and
    ``evaluate_model`` appends per-sample and summary results.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--model_path", type=str, default="/ssdwork/huyang/r1/simple_GRPO_debug/slot_gsm8k/models/Qwen2.5-7B", help="Path to the model")
    parser.add_argument("--eval_samples", type=int, default=None, help="Number of samples to evaluate, None for full evaluation")
    parser.add_argument("--split", type=str, default="test", choices=["test", "train"], help="Dataset split to evaluate on")
    parser.add_argument("--do_sample", action="store_true", help="Whether to use sampling for generation")
    parser.add_argument("--temperature", type=float, default=0.9, help="Generation temperature")
    parser.add_argument("--seed", type=int, default=42, help="Random seed for consistent evaluation samples")
    args = parser.parse_args()

    print(f"Loading model from: {args.model_path}")
    tokenizer = AutoTokenizer.from_pretrained(args.model_path)

    # Ensure same model loading parameters as training if applicable
    model = Qwen2ForCausalLM.from_pretrained(
        args.model_path,
        torch_dtype=torch.bfloat16,
        _attn_implementation="sdpa" # Use 'flash_attention_2' if available and preferred
    ).to("cuda") # Consider adding device management if multiple GPUs

    # Set generation parameters. ``do_sample`` follows the CLI flag; it was
    # previously hard-coded to False, so ``--do_sample`` had no effect on
    # generation even though it was logged. Temperature is only supplied
    # when sampling is enabled and is None otherwise.
    generation_params = {
        "do_sample": args.do_sample,
        "temperature": args.temperature if args.do_sample else None,
        "max_new_tokens": 512,
    }

    # Get environment variables (consider passing as args instead for clarity).
    # In this function they are only used to name the log file and are
    # recorded in its header.
    # NOTE(review): presumably the SLOT model code reads the same variables.
    times = os.environ.get("times", "3")
    lr = os.environ.get("lr", "0.001")

    # Create log directory and file
    log_dir = "logs"
    os.makedirs(log_dir, exist_ok=True)
    log_file = os.path.join(log_dir, f"log_times_{times}_lr_{lr}.txt")

    # Log basic information; 'w' overwrites any log from a previous run with
    # the same times/lr values.
    with open(log_file, "w", encoding="utf-8") as f:
        f.write(f"Model Path: {args.model_path}\n")
        f.write(f"Times (from env): {times}\n")
        f.write(f"LR (from env): {lr}\n")
        f.write(f"Eval Samples: {'All' if args.eval_samples is None else args.eval_samples}\n")
        f.write(f"Dataset Split: {args.split}\n")
        f.write(f"Do Sample: {args.do_sample}\n")
        f.write(f"Temperature: {args.temperature}\n")
        f.write(f"Seed: {args.seed}\n\n")

    # Run the evaluation; accuracies are printed and logged inside
    # evaluate_model, so the return value is not needed here.
    evaluate_model(
        model,
        tokenizer,
        eval_samples=args.eval_samples,
        split=args.split,
        generation_params=generation_params,
        seed=args.seed,
        log_file=log_file,
    )

    print(f"Evaluation complete. Results logged to {log_file}")

# Run the evaluation only when executed as a script, not when imported.
if __name__ == "__main__":
    main()