BrowserAI/browser_agent.py at main · CyanXLab/BrowserAI · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
"""
智能浏览器代理 - 自主分析和执行任务
支持：
1. 使用本地浏览器（如Edge）或Playwright内置浏览器
2. AI可随时暂停任务询问用户
3. 人机协作完成任务

核心能力：
- 获取页面DOM结构和可见元素
- 用LLM分析网页，决定下一步操作
- 自动定位元素（支持多种策略）
- 智能询问用户以获得帮助
"""
import json
import re
import asyncio
import os
from typing import Dict, Any, List, Optional, Tuple, Callable
from dataclasses import dataclass, field
from playwright.async_api import async_playwright, Page, Browser, BrowserContext
from config import Config


@dataclass
class AskQuestion:
    """A question the agent wants to put to the human user.

    ``question`` is the text to ask, ``options`` is an optional list of
    suggested answers, and ``context`` is free-form background shown with it.
    """
    question: str
    options: Optional[List[str]] = None
    context: str = ""  # Background information about the current situation

    def format_prompt(self) -> str:
        """Render the question, context and numbered options as one prompt."""
        pieces = [f"🤔 我需要你的帮助：\n\n{self.question}\n\n{self.context}\n"]

        if self.options:
            pieces.append("\n选项:\n")
            # Options are numbered starting from 1.
            pieces.extend(
                f"  {number}. {choice}\n"
                for number, choice in enumerate(self.options, 1)
            )

        pieces.append("\n请回复你的选择或直接输入答案：")
        return "".join(pieces)


class SmartBrowserAgent:
    """智能浏览器代理"""

    def __init__(self):
        self.playwright = None
        self.browser: Optional[Browser] = None
        self.context: Optional[BrowserContext] = None
        self.page: Optional[Page] = None
        self.task_history: List[Dict] = []
        self.max_steps = 25
        self.ask_callback: Optional[Callable] = None  # 询问用户时的回调函数
        self.use_local_browser = True  # 默认使用本地浏览器

    def set_ask_callback(self, callback: Callable):
        """Register the callback used to put questions to the user.

        ``run_task`` awaits ``callback(question, context)`` when the model
        returns an ``ask_user`` action and feeds the returned answer back into
        the conversation.
        """
        self.ask_callback = callback

    async def start(self):
        """Start Playwright and open a browser with a persistent profile.

        The profile lives in a ``browser_data`` directory next to this file so
        cookies and login state are kept between runs. When
        ``Config().BROWSER.use_local_browser`` is true, the first existing
        executable among a list of well-known Windows Edge/Chrome locations and
        the ``LOCAL_BROWSER_PATH`` environment variable is launched. If none
        exists, or launching it fails, a Playwright-bundled browser is used
        instead. Afterwards ``self.page`` is the context's first page, or a
        newly created one when the context has no pages.
        """
        self.playwright = await async_playwright().start()

        # Read configuration.
        cfg = Config()
        self.use_local_browser = cfg.BROWSER.use_local_browser

        # Profile directory used to persist cookies and login state.
        user_data_dir = os.path.join(os.path.dirname(__file__), "browser_data")
        os.makedirs(user_data_dir, exist_ok=True)
        print(f"📁 浏览器数据目录: {user_data_dir}")

        if self.use_local_browser:
            # Candidate executables, checked in order.
            candidate_paths = [
                # Edge
                r"C:\Program Files (x86)\Microsoft\Edge\Application\msedge.exe",
                r"C:\Program Files\Microsoft\Edge\Application\msedge.exe",
                r"C:\Users\%USERNAME%\AppData\Local\Microsoft\Edge\Application\msedge.exe",
                # Chrome
                r"C:\Program Files\Google\Chrome\Application\chrome.exe",
                r"C:\Program Files (x86)\Google\Chrome\Application\chrome.exe",
                # Explicit override from the environment
                os.environ.get("LOCAL_BROWSER_PATH", "")
            ]

            browser_path = None
            for raw_path in candidate_paths:
                if not raw_path:
                    continue
                # os.path.exists does not expand environment variables, so a
                # literal "%USERNAME%" would never match without this step.
                path = os.path.expandvars(raw_path)
                if os.path.exists(path):
                    browser_path = path
                    print(f"✅ 找到本地浏览器: {path}")
                    break

            if browser_path:
                try:
                    # A persistent context keeps cookies in user_data_dir.
                    self.context = await self.playwright.chromium.launch_persistent_context(
                        executable_path=browser_path,
                        user_data_dir=user_data_dir,
                        headless=cfg.BROWSER.headless,
                        viewport={"width": cfg.BROWSER.viewport_width, "height": cfg.BROWSER.viewport_height},
                        user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 Edg/120.0.0.0",
                        args=['--start-maximized']  # Maximize the window
                    )
                    print("✅ 成功启动本地浏览器（带数据持久化）")
                except Exception as e:
                    print(f"⚠️ 启动本地浏览器失败: {e}")
                    print("🔄 切换到Playwright内置浏览器...")
                    await self._start_builtin_browser_persistent(cfg, user_data_dir)
            else:
                print("⚠️ 未找到本地浏览器")
                print("🔄 使用Playwright内置浏览器...")
                await self._start_builtin_browser_persistent(cfg, user_data_dir)
        else:
            # Go straight to the Playwright-bundled browser.
            await self._start_builtin_browser_persistent(cfg, user_data_dir)

        # Reuse the context's first page if it has one, otherwise open a page.
        pages = self.context.pages
        if pages:
            self.page = pages[0]
        else:
            self.page = await self.context.new_page()
        print("✅ 浏览器已就绪（cookie将自动保存）")

    async def _start_builtin_browser_persistent(self, cfg, user_data_dir):
        """启动内置浏览器（持久化上下文）"""
        launch_options = {
            "user_data_dir": user_data_dir,
            "headless": cfg.BROWSER.headless,
            "viewport": {"width": cfg.BROWSER.viewport_width, "height": cfg.BROWSER.viewport_height}
        }

        try:
            self.context = await self.playwright.chromium.launch_persistent_context(**launch_options)
        except:
            try:
                self.context = await self.playwright.firefox.launch_persistent_context(**launch_options)
            except:
                self.context = await self.playwright.webkit.launch_persistent_context(**launch_options)

    async def _start_builtin_browser(self, cfg):
        """启动内置浏览器"""
        launch_options = {"headless": cfg.BROWSER.headless}

        try:
            self.browser = await self.playwright.chromium.launch(**launch_options)
        except:
            try:
                self.browser = await self.playwright.firefox.launch(**launch_options)
            except:
                self.browser = await self.playwright.webkit.launch(**launch_options)

    async def close(self):
        """关闭浏览器"""
        if self.browser:
            await self.browser.close()
        if self.playwright:
            await self.playwright.stop()
        print("✅ 浏览器已关闭")

    async def get_page_info(self) -> Dict[str, Any]:
        """获取当前页面信息"""
        if not self.page:
            return {"error": "浏览器未初始化"}

        try:
            title = await self.page.title()
            url = self.page.url
            elements = await self._extract_interactive_elements()
            content = await self._get_page_content_summary()

            return {
                "url": url,
                "title": title,
                "elements": elements,
                "content_summary": content
            }
        except Exception as e:
            return {"error": str(e)}

    async def _extract_interactive_elements(self) -> List[Dict]:
        """提取可交互元素"""
        elements = []

        selectors = [
            ("input[type='text']", "text_input"),
            ("input[type='search']", "search_input"),
            ("input[type='email']", "email_input"),
            ("input[type='password']", "password_input"),
            ("textarea", "textarea"),
            ("button", "button"),
            ("input[type='submit']", "submit_button"),
            ("input[type='button']", "button"),
            ("a[href]", "link"),
            ("select", "dropdown"),
            ("input[type='checkbox']", "checkbox"),
            ("input[type='radio']", "radio"),
        ]

        for selector, elem_type in selectors:
            try:
                elems = await self.page.query_selector_all(selector)
                for i, elem in enumerate(elems[:10]):  # 每种类型限制10个
                    try:
                        text = await elem.inner_text()
                        text = text.strip()[:60] if text else ""
                        placeholder = await elem.get_attribute("placeholder") or ""
                        name = await elem.get_attribute("name") or ""
                        id_attr = await elem.get_attribute("id") or ""

                        possible_selectors = []
                        if id_attr:
                            possible_selectors.append(f"#{id_attr}")
                        if name:
                            possible_selectors.append(f"{selector}[name='{name}']")
                        if placeholder:
                            possible_selectors.append(f"{selector}[placeholder*='{placeholder}']")
                        if text and len(text) < 40:
                            possible_selectors.append(selector)
                        possible_selectors.append(f"{selector}:nth-of-type({i+1})")

                        elements.append({
                            "type": elem_type,
                            "text": text,
                            "placeholder": placeholder,
                            "selectors": possible_selectors[:3],
                            "index": i
                        })
                    except:
                        continue
            except:
                continue

        return elements[:30]

    async def _get_page_content_summary(self) -> str:
        """获取页面文本摘要"""
        try:
            # 获取主要内容
            text = await self.page.evaluate('''() => {
                // 获取主要文本内容
                const main = document.querySelector('main, article, .content, #content');
                if (main) return main.innerText;
                // 退回body
                return document.body.innerText;
            }''')
            lines = [line.strip() for line in text.split('\n') if line.strip()]
            return '\n'.join(lines[:25])[:1200]
        except:
            return ""

    async def execute_action(self, action: Dict[str, Any]) -> Tuple[bool, str]:
        """执行操作"""
        action_type = action.get("action", "").lower()

        try:
            if action_type == "navigate":
                url = action.get("url", "")
                if not url.startswith(("http://", "https://")):
                    url = f"https://{url}"
                await self.page.goto(url, wait_until="domcontentloaded", timeout=45000)
                await asyncio.sleep(1)
                return True, f"导航到 {url}"

            elif action_type == "search":
                query = action.get("query", "")
                engine = action.get("engine", "bing")
                if engine == "baidu":
                    url = f"https://www.baidu.com/s?wd={query}"
                elif engine == "google":
                    url = f"https://www.google.com/search?q={query}"
                else:  # bing
                    url = f"https://www.bing.com/search?q={query}"
                await self.page.goto(url, wait_until="domcontentloaded", timeout=45000)
                await asyncio.sleep(1)
                return True, f"在{engine}搜索: {query}"

            elif action_type == "click":
                selector = action.get("selector", "")
                text = action.get("text", "")

                if not selector and text:
                    # 尝试用文本点击
                    try:
                        await self.page.click(f"text={text}", timeout=3000)
                        return True, f"点击: {text}"
                    except:
                        pass

                if selector:
                    await self.page.wait_for_selector(selector, timeout=5000)
                    await self.page.click(selector)
                    await asyncio.sleep(0.5)
                    return True, f"点击: {text or selector}"

                return False, "未找到可点击元素"

            elif action_type == "type":
                selector = action.get("selector", "")
                text = action.get("text", "")

                if not selector:
                    # 智能查找输入框
                    selectors = [
                        "input[type='search']",
                        "input[name='q']",
                        "input[name='wd']",
                        "input[placeholder*='搜索']",
                        "input[placeholder*='Search']",
                        "input[type='text']:visible",
                        "textarea:visible"
                    ]
                    for sel in selectors:
                        try:
                            if await self.page.query_selector(sel):
                                selector = sel
                                break
                        except:
                            continue

                if not selector:
                    return False, "未找到输入框"

                await self.page.wait_for_selector(selector, timeout=5000)
                await self.page.fill(selector, text)

                if action.get("submit", False):
                    await self.page.press(selector, "Enter")
                    await asyncio.sleep(1.5)

                return True, f"输入: {text[:50]}"

            elif action_type == "find_and_click":
                keywords = action.get("keywords", [])
                for keyword in keywords:
                    try:
                        # 尝试多种方式定位
                        locators = [
                            f"text={keyword}",
                            f"a:has-text('{keyword}')",
                            f"button:has-text('{keyword}')",
                            f"[title='{keyword}']"
                        ]
                        for loc in locators:
                            try:
                                await self.page.click(loc, timeout=2000)
                                return True, f"找到并点击: {keyword}"
                            except:
                                continue
                    except:
                        continue
                return False, f"未找到包含关键词的元素: {keywords}"

            elif action_type == "scroll":
                direction = action.get("direction", "down")
                amount = action.get("amount", 500)
                await self.page.evaluate(f"window.scrollBy(0, {amount if direction=='down' else -amount})")
                return True, f"向{direction}滚动 {amount}px"

            elif action_type == "wait":
                seconds = action.get("seconds", 2)
                await asyncio.sleep(seconds)
                return True, f"等待 {seconds}秒"

            elif action_type == "screenshot":
                path = action.get("path", "screenshot.png")
                await self.page.screenshot(path=path, full_page=action.get("full_page", True))
                return True, f"截图保存到 {path}"

            elif action_type == "extract":
                selector = action.get("selector", "body")
                text = await self.page.inner_text(selector)
                return True, text[:3000]

            elif action_type == "ask_user":
                # 注意：不在 execute_action 中调用 ask_callback
                # 让 run_task() 集中处理询问逻辑，避免重复调用
                question = action.get("question", "")
                context = action.get("context", "")
                return True, f"需要询问用户：{question}"

            elif action_type == "done":
                return True, action.get("answer", "任务完成")

            else:
                return False, f"未知操作: {action_type}"

        except Exception as e:
            return False, f"操作失败: {str(e)}"

    def build_system_prompt(self) -> str:
        """Return the fixed system prompt sent to the LLM.

        The prompt (written in Chinese) describes the agent's role, lists the
        JSON actions it may return, explains when to use ``ask_user``, and
        states the rules the model must follow. The action names listed here
        are the ones ``execute_action`` dispatches on.
        """
        return """你是一个智能浏览器代理，可以自主控制浏览器完成用户任务。

## 你的职责：
1. 分析当前页面状态
2. 决定下一步操作
3. 遇到不确定的问题时询问用户
4. 循环执行直到完成任务

## 可用操作（JSON格式）：

**基础操作：**
- {"action": "navigate", "url": "https://..."} - 导航到指定URL
- {"action": "search", "query": "关键词", "engine": "bing/baidu/google"} - 搜索（默认bing）
- {"action": "click", "selector": "#id", "text": "按钮文本"} - 点击元素
- {"action": "type", "selector": "input[name='q']", "text": "内容", "submit": true} - 输入文本

**智能操作：**
- {"action": "find_and_click", "keywords": ["关键词1", "关键词2"]} - 自动查找并点击
- {"action": "scroll", "direction": "down/up", "amount": 500} - 滚动页面
- {"action": "wait", "seconds": 2} - 等待加载
- {"action": "screenshot", "path": "result.png", "full_page": true} - 截图
- {"action": "extract", "selector": "body"} - 提取页面内容

**询问用户（重要）：**
- {"action": "ask_user", "question": "具体的问题", "context": "当前上下文信息"}

使用 ask_user 场景：
1. 需要用户登录（"需要登录才能继续，请在浏览器中完成登录后告诉我"）
2. 需要验证码（"页面需要验证码，请查看浏览器输入后告诉我"）
3. 需要用户选择（"这里有多个选项，您想要点击哪一个？"）
4. 需要确认信息（"我要点击'提交订单'按钮，确认吗？"）
5. 任务需要更多信息（"您要搜索什么内容？"）

**完成任务：**
- {"action": "done", "answer": "任务结果的总结"} - 结束任务

## 规则：
1. 每次只返回一个操作
2. **不确定时，使用 ask_user 询问用户**（特别是需要登录、验证码、选择时）
3. 遇到错误时尝试替代方案
4. 完成任务必须返回 done 操作
5. selector优先级: #id > [name='...'] > 文本匹配
"""

    async def run_task(self, task: str, llm_client) -> str:
        """运行复杂任务，支持人机协作"""
        if not self.page:
            await self.start()

        messages = [
            {"role": "system", "content": self.build_system_prompt()},
            {"role": "user", "content": f"任务: {task}\n\n请开始执行。如果遇到不确定的问题，请使用 ask_user 询问我。"}
        ]

        step = 0
        last_result = ""

        while step < self.max_steps:
            step += 1
            print(f"\n[Step {step}]")

            # 获取页面状态
            page_info = await self.get_page_info()

            if "error" in page_info:
                return f"❌ 获取页面失败：{page_info['error']}"

            # 注意：移除了自动登录检测逻辑，让 AI 通过分析页面内容自己决定是否需要询问用户
            # 这样避免了重复询问的问题（自动检测 + AI 决策双重触发）

            # 构建观察信息
            observation = f"""当前页面信息：
- URL: {page_info['url']}
- 标题：{page_info['title']}
- 可交互元素 ({len(page_info['elements'])} 个)：
"""
            for i, elem in enumerate(page_info['elements'][:12]):
                text = elem.get('text', '') or elem.get('placeholder', '')
                elem_type = elem.get('type', '')
                if text:
                    observation += f"  {i+1}. [{elem_type}] {text[:45]}\n"

            if page_info['content_summary']:
                observation += f"\n页面内容摘要:\n{page_info['content_summary'][:500]}"

            if last_result:
                observation += f"\n\n上次操作结果: {last_result}\n"

            observation += f"\n任务: {task}\n\n请决定下一步操作（或询问用户）："

            # 调用 LLM 决策（添加 1 秒延迟防止 API 限速）
            messages.append({"role": "user", "content": observation})
            llm_client.messages = messages
            await asyncio.sleep(1)  # 添加延迟防止 API 限速
            response = llm_client.chat(observation, json_mode=True, auto_add_message=False)

            try:
                action = json.loads(response)
            except:
                try:
                    start = response.find('{')
                    end = response.rfind('}') + 1
                    if start >= 0 and end > start:
                        action = json.loads(response[start:end])
                    else:
                        return f"❌ 无法解析操作: {response[:200]}"
                except:
                    return f"❌ 解析失败: {response[:200]}"

            print(f"操作: {action}")

            # 执行操作
            success, result = await self.execute_action(action)
            print(f"结果: {result}")

            # 处理询问用户 - 在这里集中处理，避免重复
            if action.get("action") == "ask_user":
                question = action.get("question", "")
                context = action.get("context", "")

                if self.ask_callback:
                    print(f"\n🤔 {question}")
                    user_answer = await self.ask_callback(question, context)
                    last_result = f"用户回答：{user_answer}"
                    # 添加用户回答到消息历史，然后继续下一轮（不添加操作结果）
                    messages.append({"role": "assistant", "content": json.dumps(action, ensure_ascii=False)})
                    messages.append({"role": "user", "content": f"用户回答：{user_answer}"})
                    continue
                else:
                    # 如果没有设置回调，直接继续
                    last_result = "未设置询问回调，继续执行"
                    continue

            # 检查完成
            if action.get("action") == "done":
                return result

            last_result = result
            messages.append({"role": "assistant", "content": json.dumps(action, ensure_ascii=False)})
            messages.append({"role": "user", "content": f"操作结果: {result}"})

            await asyncio.sleep(0.5)

        return "⚠️ 达到最大步骤限制"

    def _detect_login_required(self, page_info: Dict) -> bool:
        """检测页面是否需要登录"""
        content = page_info.get('content_summary', '').lower()
        title = page_info.get('title', '').lower()

        # 登录相关关键词
        login_keywords = ['登录', 'login', 'sign in', '注册', 'register', '请登录', '请先登录']

        for kw in login_keywords:
            if kw in content or kw in title:
                # 检查元素是否很少（通常登录页可交互元素少）
                if len(page_info.get('elements', [])) < 5:
                    return True

        return False


# 向后兼容
class SimpleBrowserController:
    """Thin backward-compatible wrapper around ``SmartBrowserAgent``."""

    def __init__(self):
        self.agent = SmartBrowserAgent()

    async def start(self):
        """Start the wrapped agent's browser."""
        await self.agent.start()

    async def close(self):
        """Close the wrapped agent's browser."""
        await self.agent.close()

    async def execute(self, action: str, **kwargs):
        """Run one action on the agent and return only its result message.

        ``action`` becomes the ``"action"`` entry of the action dict and the
        keyword arguments supply the remaining entries. The success flag
        returned by the agent is discarded.
        """
        payload = {"action": action}
        payload.update(kwargs)
        _, outcome = await self.agent.execute_action(payload)
        return outcome