-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathevaluation.py
More file actions
105 lines (101 loc) · 3.94 KB
/
evaluation.py
File metadata and controls
105 lines (101 loc) · 3.94 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
import pdb
import openai
from http import HTTPStatus
import dashscope
import requests
import json
import glob
import os
import base64
import sys
sys.path.insert(0, '../')
from SeeAct.src.demo_utils.format_prompt import postprocess_action_lmm
def f1(pred, label):
    """Return the F1 score between the token sets of two strings.

    Both strings are split on whitespace and de-duplicated, so token order
    and repetition do not affect the score.

    Args:
        pred: Predicted text.
        label: Reference text.

    Returns:
        1 when both strings have no tokens, 0 when exactly one of them has
        no tokens or when they share no token, otherwise the harmonic mean
        of precision and recall as a float.
    """
    predicted = set(pred.split())
    expected = set(label.split())

    # Degenerate cases: nothing to compare on one or both sides.
    if not predicted and not expected:
        return 1
    if not predicted or not expected:
        return 0

    shared = len(predicted & expected)
    if shared == 0:
        # Zero overlap means precision and recall are both zero.
        return 0

    # |pred| == tp + fp and |label| == tp + fn, so the set sizes are the
    # denominators of precision and recall.
    precision = shared / len(predicted)
    recall = shared / len(expected)
    return 2 * precision * recall / (precision + recall)
def process(example):
    """Split an ``ELEMENT: .. ACTION: .. VALUE: ..`` string into its fields.

    Each field starts right after the first occurrence of its marker and
    ends one character before the next marker that is present in the
    string (dropping the single separator character in between), or at the
    end of the string when no later marker is present.

    A marker that does not occur yields an empty string for its field
    instead of being sliced with the ``-1`` that ``str.find`` returns, which
    previously truncated the preceding field and produced a garbage value.

    Args:
        example: Text containing the markers, e.g.
            ``'ELEMENT: A\\nACTION: CLICK\\nVALUE: None'``.

    Returns:
        A list ``[element, action, value]`` of strings.
    """
    markers = ('ELEMENT: ', 'ACTION: ', 'VALUE: ')
    positions = [example.find(marker) for marker in markers]

    fields = []
    for index, (marker, start) in enumerate(zip(markers, positions)):
        if start == -1:
            # Marker missing: report an empty field rather than mis-slicing.
            fields.append('')
            continue
        # The field ends just before the separator preceding the next
        # marker that actually occurs; otherwise it runs to the end.
        end = len(example)
        for following in positions[index + 1:]:
            if following != -1:
                end = following - 1
                break
        fields.append(example[start + len(marker):end])
    return fields
def _evaluate(results):
    """Accumulate per-split sample counts and metric totals.

    Each sample is a dict with ``'predict'`` and ``'label'`` strings and an
    optional ``'split'`` key (``'website'``, ``'task'`` or ``'domain'``).
    Samples without a ``'split'`` key are counted under ``'website'``, and
    their count is tracked too so the totals and denominators always agree.

    Returns:
        A tuple ``(num, ele_acc, step_sr, f1_dict)`` of dicts keyed by split.
    """
    splits = ('website', 'task', 'domain')
    num = dict.fromkeys(splits, 0)
    ele_acc = dict.fromkeys(splits, 0)
    step_sr = dict.fromkeys(splits, 0)
    f1_dict = dict.fromkeys(splits, 0)

    for sample in results:
        pred = process(sample['predict'])
        label = process(sample['label'])
        # NOTE(review): action and value are concatenated without a
        # separator, so the action token fuses with the first value token
        # (e.g. 'TYPEnew'). Kept as-is to leave reported scores unchanged;
        # confirm whether a space was intended.
        f1_score = f1(pred[1] + pred[2], label[1] + label[2])

        split = sample.get('split', 'website')
        num[split] += 1
        f1_dict[split] += f1_score
        if pred[0] == label[0]:
            ele_acc[split] += 1
            # NOTE(review): a step is counted as successful only when the
            # element also matches; the nesting was reconstructed from a
            # paste without indentation -- confirm against the original.
            if pred[1] == label[1] and pred[2].lower() == label[2].lower():
                step_sr[split] += 1
    return num, ele_acc, step_sr, f1_dict


def _print_split(split, num, ele_acc, step_sr, f1_dict):
    """Print element accuracy, step success rate and mean F1 for *split*.

    Prints ``nan`` for each metric when the split has no samples instead of
    raising ZeroDivisionError.
    """
    count = num[split]
    for total in (ele_acc[split], step_sr[split], f1_dict[split]):
        print(total / count if count else float('nan'))


if __name__ == '__main__':
    # The results file may be given as the first command-line argument;
    # otherwise the original hard-coded location is used.
    default_plan_path = 'checkpoints/flant5-base_mind2web/results_test_task.json'
    plan_path = sys.argv[1] if len(sys.argv) > 1 else default_plan_path
    with open(plan_path, 'r', encoding='utf-8') as results_file:
        results = json.load(results_file)

    num, ele_acc, step_sr, f1_dict = _evaluate(results)

    if not any('split' in sample for sample in results):
        # No split information: every sample was counted under 'website'.
        _print_split('website', num, ele_acc, step_sr, f1_dict)
    else:
        print('******Cross-website:*********')
        _print_split('website', num, ele_acc, step_sr, f1_dict)
        print('******Cross-task:*********')
        _print_split('task', num, ele_acc, step_sr, f1_dict)
        print('******Cross-domain:*********')
        _print_split('domain', num, ele_acc, step_sr, f1_dict)