gpt_infer.py
import json
import base64
import pdb
import dashscope
from http import HTTPStatus
import openai
import os

from SeeAct.src.demo_utils.format_prompt import postprocess_action_lmm
# from openai import OpenAI
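# NOTE: this script targets the pre-1.0 `openai` Python SDK
# (openai.api_key / openai.api_base / openai.ChatCompletion.create);
# openai>=1.0 removed that interface. `dashscope` and `HTTPStatus` are only
# needed for the commented-out Qwen-VL paths below.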
def encode_image(image_path):
    """Read an image file and return its contents as a base64 string."""
    with open(image_path, "rb") as image_file:
        return base64.b64encode(image_file.read()).decode('utf-8')
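# The base64 string is embedded below as an OpenAI vision data URL, e.g.
# {"type": "image_url", "image_url": {"url": "data:image/jpeg;base64,<...>"}}.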
OPENAI_API_KEY = 'YOUR_API_KEY'
openai.api_key = OPENAI_API_KEY
openai.api_base = "YOUR_API_BASE"
headers = {
    "Content-Type": "application/json",
    "Authorization": f"Bearer {OPENAI_API_KEY}"
}
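# `headers` is never used below; the openai SDK handles auth itself. It would
# only matter if these requests were replayed over raw HTTP.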
# gpt_model = "gpt-4-vision-preview"
gpt_model = "gpt-4o"
previous_k = 5
save_path = 'checkpoint/flan-t5-base/memory.json'
path = 'playground/grounding_results/30_selected_tasks/image_annotation_3images_website'
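# Expected layout: each task directory under `path` contains a queries.jsonl
# file plus images/0_raw.jpg (clean screenshot) and images/0_labeled.jpg
# (screenshot with red candidate boxes). Note queries.jsonl is read with
# json.load, i.e. a single JSON object despite the .jsonl extension.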
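# Pipeline per task: (1) ask the model for a free-form plan on the raw
# screenshot; (2) ground the next action against lettered candidate choices on
# the labeled screenshot; (3) if the parsed prediction disagrees with the
# ground truth, request a reflection and retry once with the reflection folded
# into the prompt (Reflexion-style).
# Caveat: os.walk recurses, but os.path.join(path, task_action) is only valid
# for top-level directories, so only the first level is actually usable.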
for root, dirs, files in os.walk(path):
    acc = 0
    num = 0
    if os.path.exists(save_path):
        log_list = json.load(open(save_path))
    else:
        log_list = []
    for i, task_action in enumerate(dirs):
        queries = json.load(open(os.path.join(path, task_action, 'queries.jsonl')))
        system_prompt = "Imagine that you are imitating humans doing web navigation for a task step by step. At each stage, you can see the webpage like humans by a screenshot and know the previous actions before the current step decided by yourself through recorded history. You need to decide on the first following action to take. You can click on an element with the mouse, select an option, type text or press Enter with the keyboard. (For your understanding, they are like the click(), select_option() and type() functions in playwright respectively) One next step means one operation within the three."
        base64_image_raw = encode_image(os.path.join(path, task_action, 'images/0_raw.jpg'))
        base64_image_labeled = encode_image(os.path.join(path, task_action, 'images/0_labeled.jpg'))
        prompt = f"Task:\n{queries['confirmed_task']}\nCombined with the screenshot, make a plan to accomplish the task. The plan should contain a complete flow of actions."
        # Round 1 (GPT-4V/4o): planning on the raw screenshot.
        prompt1_input = [
            {"role": "system", "content": [{"type": "text", "text": system_prompt}]},
            {"role": "user",
             "content": [{"type": "text", "text": prompt},
                         {"type": "image_url",
                          "image_url": {"url": f"data:image/jpeg;base64,{base64_image_raw}",
                                        "detail": "high"}}]},
        ]
        response1 = openai.ChatCompletion.create(
            model=gpt_model,
            messages=prompt1_input,
            max_tokens=4096,
            temperature=0,
        )
        answer1 = response1["choices"][0]["message"]["content"]
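        # answer1 holds the model's free-form plan; it is replayed verbatim as
        # an assistant turn in the grounding and reflection calls below.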
        # DeepSeek-V2 alternative (requires `deepseek_model` and a DeepSeek api_base):
        # response_ds = openai.ChatCompletion.create(
        #     model=deepseek_model,
        #     messages=prompt1_input,
        #     max_tokens=4096,
        #     temperature=0,
        #     stream=False,
        # )
        # answer1 = response_ds.choices[0].message.content
        # Qwen-VL alternative (via dashscope):
        # messages = [
        #     {"role": "user",
        #      "content": [
        #          {"image": os.path.join(path, task_action, 'images/0_raw.jpg')},
        #          {"text": system_prompt + '\n' + prompt}]}]
        # response_qwen = dashscope.MultiModalConversation.call(model='qwen-vl-max', messages=messages)
        # if response_qwen.status_code == HTTPStatus.OK:
        #     answer1 = response_qwen['output']['choices'][0]['message']['content'][0]['text']
        # else:
        #     print(response_qwen.code)     # error code
        #     print(response_qwen.message)  # error message
        print(answer1)
        print("**************************************************************************")
        prompt_2 = "Previous Actions:\n"
        if len(queries["previous_actions"]) > 0:
            for prev_action in queries["previous_actions"][-previous_k:]:
                prompt_2 += f"{prev_action}\n"
        else:
            prompt_2 += "None\n"
        prompt_2 += """Combining the screenshot with each step of the previous action history and its intention, compare against the task plan above and identify the completed action steps. Then, based on the task plan and screenshot, in conjunction with human web browsing habits and the logic of web design, what should be the next action to complete the task? Please select from the following choices:\n"""
        for idx, choice in enumerate(queries["choices"]):
            # label choices A, B, C, ... via ASCII offset
            prompt_2 += f"{chr(65 + idx)}. {choice[1]}\n"
        prompt_2 += """None. None of the other options match the correct element.
All the choices above are HTML of interactive elements in the screenshot. You should take into account both their text content and location in the screenshot (red rectangle) to determine whether one matches your target element. If none of these elements match your target element, please select None. None of the other options match the correct element.
Conclude your answer using the format below. Ensure your answer strictly adheres to the format provided below. Please do not leave any explanation in your answer of the final standardized format part, and this final part should be clear and certain. The element choice, action, and value should be on three separate lines.
Format:
ELEMENT: The uppercase letter of your choice. (No need for PRESS ENTER)
ACTION: Choose an action from {CLICK, SELECT, TYPE, PRESS ENTER, TERMINATE, NONE}.
VALUE: Provide additional input based on ACTION.
The VALUE means:
If ACTION == TYPE, specify the text to be typed.
If ACTION == SELECT, indicate the option to be chosen. Revise the selection value to align with the available options within the element.
If ACTION == CLICK, PRESS ENTER, TERMINATE or NONE, write "None".
NOTE THAT your answer should strictly contain only 1 ELEMENT, 1 ACTION, and 1 VALUE!!!"""
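        # Round 2: grounding. Each entry in queries["choices"] appears to be a
        # (backend_node_id, element_html) pair; the html is listed lettered,
        # and the strict ELEMENT/ACTION/VALUE format is what
        # postprocess_action_lmm parses back into an (element, action, value)
        # triple.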
        prompt2_input = [
            {"role": "system", "content": [{"type": "text", "text": system_prompt}]},
            {"role": "user",
             "content": [{"type": "text", "text": prompt},
                         {"type": "image_url",
                          "image_url": {"url": f"data:image/jpeg;base64,{base64_image_raw}",
                                        "detail": "high"}}]},
            {"role": "assistant", "content": [{"type": "text", "text": f"{answer1}"}]},
            {"role": "user",
             "content": [{"type": "text", "text": prompt_2},
                         {"type": "image_url",
                          "image_url": {"url": f"data:image/jpeg;base64,{base64_image_labeled}",
                                        "detail": "high"}}]},
        ]
        response2 = openai.ChatCompletion.create(
            model=gpt_model,
            messages=prompt2_input,
            max_tokens=4096,
            temperature=0,
        )
        answer2 = response2["choices"][0]["message"]["content"]
        # DeepSeek-V2 alternative:
        # response_ds = openai.ChatCompletion.create(
        #     model=deepseek_model,
        #     messages=prompt2_input,
        #     max_tokens=4096,
        #     temperature=0,
        #     stream=False,
        # )
        # answer2 = response_ds.choices[0].message.content
        # Qwen-VL alternative:
        # messages.append({'role': response_qwen.output.choices[0].message.role,
        #                  'content': response_qwen.output.choices[0].message.content})
        # messages.append({"role": "user",
        #                  "content": [
        #                      {"image": os.path.join(path, task_action, 'images/0_labeled.jpg')},
        #                      {"text": prompt_2}]})
        # response_qwen = dashscope.MultiModalConversation.call(model='qwen-vl-max', messages=messages)
        # if response_qwen.status_code == HTTPStatus.OK:
        #     answer2 = response_qwen['output']['choices'][0]['message']['content'][0]['text']
        # else:
        #     print(response_qwen.code)     # error code
        #     print(response_qwen.message)  # error message
        print(answer2)
        print("**************************************************************************")
        # Ground truth: the annotations appear to follow the Mind2Web format,
        # giving the positive candidate's backend_node_id and the gold operation.
        element_id = queries["pos_candidates"][0]['backend_node_id']
        candidates = queries['choices']
        element = None
        for idx, candidate in enumerate(candidates):
            if candidate[0] == element_id:
                target_element = queries['choices'][idx]
                element = chr(65 + idx)
        action = queries["operation"]["op"]
        value = queries["operation"]["value"]
        if value == '':
            value = 'None'
        target = (element, action, value)
        pred = postprocess_action_lmm(answer2)
        print(target)
        print(pred)
        # pred_element = queries['choices'][ord(pred[0]) - 65]
        print("Task: ", task_action)
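        # Any mismatch on (element, action, value) triggers the reflection
        # branch below; `acc` and `num` are declared above but never updated.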
        if target != pred:
            if target[0] != pred[0]:
                prompt_reflect = f"You have chosen a wrong web ELEMENT!! The correct ELEMENT is {element}.\n " + "Compared with the correct element, reflect on your mistakes in the above analysis and decision-making process, and generate your insights on what you will do when facing a similar task again to avoid failing the task in the same way.\nConclude your response using the format below. Ensure your response strictly adheres to the format provided below.\n\nFormat: In this attempt, I was unsuccessful. {Where did you make mistakes?}. Next time, I will {your solution to avoid failing the task in the same way}."
            elif target[1] != pred[1]:
                prompt_reflect = f"You have chosen the correct ELEMENT but made a wrong ACTION!! The correct ACTION is {action}.\n " + "Compared with the correct action, reflect on your mistakes in the above analysis and decision-making process, and generate your insights on what you will do when facing a similar task again to avoid failing the task in the same way.\nConclude your response using the format below. Ensure your response strictly adheres to the format provided below.\n\nFormat: In this attempt, I was unsuccessful. {Where did you make mistakes?}. Next time, I will {your solution to avoid failing the task in the same way}."
            elif target[2] != pred[2]:
                prompt_reflect = f"The ELEMENT and ACTION are both correct. But you typed or selected a wrong VALUE!! The correct VALUE is {value}.\n " + "Compared with the correct value, reflect on your mistakes in the above analysis and decision-making process, and generate your insights on what you will do when facing a similar task again to avoid failing the task in the same way.\nConclude your response using the format below. Ensure your response strictly adheres to the format provided below.\n\nFormat: In this attempt, I was unsuccessful. {Where did you make mistakes?}. Next time, I will {your solution to avoid failing the task in the same way}."
            prompt_reflect_input = [
                {"role": "system", "content": [{"type": "text", "text": system_prompt}]},
                {"role": "user",
                 "content": [{"type": "text", "text": prompt},
                             {"type": "image_url",
                              "image_url": {"url": f"data:image/jpeg;base64,{base64_image_raw}",
                                            "detail": "high"}}]},
                {"role": "assistant", "content": [{"type": "text", "text": f"{answer1}"}]},
                {"role": "user",
                 "content": [{"type": "text", "text": prompt_2},
                             {"type": "image_url",
                              "image_url": {"url": f"data:image/jpeg;base64,{base64_image_labeled}",
                                            "detail": "high"}}]},
                {"role": "assistant", "content": [{"type": "text", "text": f"{answer2}"}]},
                {"role": "user",
                 "content": [{"type": "text", "text": prompt_reflect},
                             {"type": "image_url",
                              "image_url": {"url": f"data:image/jpeg;base64,{base64_image_labeled}",
                                            "detail": "high"}}]},
            ]
            response_reflect = openai.ChatCompletion.create(
                model=gpt_model,
                messages=prompt_reflect_input,
                max_tokens=4096,
                temperature=0,
            )
            answer_reflect = response_reflect["choices"][0]["message"]["content"]
            print('\n')
            print(answer_reflect)
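            # Retry: rebuild the grounding prompt with the failed trial and the
            # reflection text prepended (Reflexion-style self-correction).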
            guide = "You have attempted to solve the task before but failed. The following reflection(s) give a plan to avoid failing the task in the same way you did previously. Use them to improve your strategy of solving the task successfully."
            prompt_3 = "Previous Actions:\n"
            if len(queries["previous_actions"]) > 0:
                for prev_action in queries["previous_actions"][-previous_k:]:
                    prompt_3 += f"{prev_action}\n"
            else:
                prompt_3 += "None\n"
            prompt_3 += guide + f"\n\n(Failure Trial)\nTask:{queries['confirmed_task']}\nPrevious Actions: "
            if len(queries["previous_actions"]) > 0:
                for prev_act_id, prev_action in enumerate(queries["previous_actions"][-previous_k:]):
                    prompt_3 += f"{prev_act_id}. {prev_action} "
            else:
                prompt_3 += "None"
            # Assumes pred[0] is a single letter; a "None" element prediction
            # would make ord() raise here.
            prompt_3 += f"\nPredicted Action: ELEMENT: {queries['choices'][ord(pred[0]) - ord('A')]} ACTION: {pred[1]} VALUE: {pred[2]}\nReflection: {answer_reflect}\n"
            prompt_3 += """Under the guidance of your reflection on the failure experience, combine the screenshot with each step of the previous action history and its intention, compare against the task plan above, and identify the completed action steps. Then, based on the task plan and screenshot, in conjunction with human web browsing habits and the logic of web design, what should be the next action to complete the task? Please select from the following choices:\n"""
            for idx, choice in enumerate(queries["choices"]):
                # label choices A, B, C, ... via ASCII offset
                prompt_3 += f"{chr(65 + idx)}. {choice[1]}\n"
            prompt_3 += """None. None of the other options match the correct element.
All the choices above are HTML of interactive elements in the screenshot. You should take into account both their text content and location in the screenshot (red rectangle) to determine whether one matches your target element. If none of these elements match your target element, please select None. None of the other options match the correct element.
Conclude your answer using the format below. Ensure your answer strictly adheres to the format provided below. Please do not leave any explanation in your answer of the final standardized format part, and this final part should be clear and certain. The element choice, action, and value should be on three separate lines.
Format:
ELEMENT: The uppercase letter of your choice. (No need for PRESS ENTER)
ACTION: Choose an action from {CLICK, SELECT, TYPE, PRESS ENTER, TERMINATE, NONE}.
VALUE: Provide additional input based on ACTION.
The VALUE means:
If ACTION == TYPE, specify the text to be typed.
If ACTION == SELECT, indicate the option to be chosen. Revise the selection value to align with the available options within the element.
If ACTION == CLICK, PRESS ENTER, TERMINATE or NONE, write "None".
NOTE THAT your answer should strictly contain only 1 ELEMENT, 1 ACTION, and 1 VALUE!!!"""
            prompt3_input = [
                {"role": "system", "content": [{"type": "text", "text": system_prompt}]},
                {"role": "user",
                 "content": [{"type": "text", "text": prompt},
                             {"type": "image_url",
                              "image_url": {"url": f"data:image/jpeg;base64,{base64_image_raw}",
                                            "detail": "high"}}]},
                {"role": "assistant", "content": [{"type": "text", "text": f"{answer1}"}]},
                {"role": "user",
                 "content": [{"type": "text", "text": prompt_3},
                             {"type": "image_url",
                              "image_url": {"url": f"data:image/jpeg;base64,{base64_image_labeled}",
                                            "detail": "high"}}]},
            ]
            response3 = openai.ChatCompletion.create(
                model=gpt_model,
                messages=prompt3_input,
                max_tokens=4096,
                temperature=0,
            )
            answer3 = response3["choices"][0]["message"]["content"]
            print(answer3)
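            # The retry answer is only printed; it is never parsed or re-scored
            # against `target`.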
        # log_list.append({"id": i, "task": task_action, "planning": answer1, "action": answer2, "target": target})
        # with open(save_path, 'w') as fp:
        #     json.dump(log_list, fp)
        print('id: ', i)
        # Drop into the debugger after every task for manual inspection.
        pdb.set_trace()