diff --git a/agent/prompt/builder.py b/agent/prompt/builder.py index 4a54963e..f5218622 100644 --- a/agent/prompt/builder.py +++ b/agent/prompt/builder.py @@ -165,7 +165,7 @@ def _build_tooling_section(tools: List[Any], language: str) -> List[str]: "terminal": "管理后台进程", "web_search": "网络搜索", "web_fetch": "获取URL内容", - "browser": "控制浏览器", + "browser": "控制浏览器(关键结果或需要协助可截图发送给用户)", "memory_search": "搜索记忆", "memory_get": "读取记忆内容", "env_config": "管理API密钥和技能配置", diff --git a/agent/protocol/agent_stream.py b/agent/protocol/agent_stream.py index 79dcd2ab..1b250011 100644 --- a/agent/protocol/agent_stream.py +++ b/agent/protocol/agent_stream.py @@ -300,13 +300,13 @@ class AgentStreamExecutor: f"with same arguments. This may indicate a loop." ) - # Check if this is a file to send (from read tool) + # Check if this is a file to send if result.get("status") == "success" and isinstance(result.get("result"), dict): result_data = result.get("result") if result_data.get("type") == "file_to_send": - # Store file metadata for later sending self.files_to_send.append(result_data) logger.info(f"📎 检测到待发送文件: {result_data.get('file_name', result_data.get('path'))}") + self._emit_event("file_to_send", result_data) # Check for critical error - abort entire conversation if result.get("status") == "critical_error": diff --git a/agent/tools/browser/browser_service.py b/agent/tools/browser/browser_service.py index d502ffb3..3065135a 100644 --- a/agent/tools/browser/browser_service.py +++ b/agent/tools/browser/browser_service.py @@ -283,7 +283,7 @@ class BrowserService: # ------------------------------------------------------------------ def navigate(self, url: str, timeout: int = 30000) -> Dict[str, Any]: - """Navigate to a URL and return page info.""" + """Navigate to a URL and wait for the page to be fully rendered.""" page = self.page try: resp = page.goto(url, wait_until="domcontentloaded", timeout=timeout) @@ -291,6 +291,14 @@ class BrowserService: except Exception as e: return {"error": f"Navigation failed: {e}"} + # Wait for network idle and visual stability + try: + page.wait_for_load_state("networkidle", timeout=10000) + except Exception: + pass + # Extra settle time for JS-rendered content (SPA frameworks, animations) + page.wait_for_timeout(800) + return { "url": page.url, "title": page.title(), diff --git a/agent/tools/browser/browser_tool.py b/agent/tools/browser/browser_tool.py index 0b51fa26..0d16406b 100644 --- a/agent/tools/browser/browser_tool.py +++ b/agent/tools/browser/browser_tool.py @@ -23,9 +23,9 @@ class BrowserTool(BaseTool): "Control a browser to navigate web pages, interact with elements, and extract content. " "Actions: navigate, snapshot, click, fill, select, scroll, screenshot, wait, back, forward, " "get_text, press, evaluate.\n\n" - "Workflow: navigate to a URL → snapshot to see the page (elements get numeric refs) → " - "use refs in click/fill/select actions → snapshot again to verify.\n\n" - "Use snapshot (not screenshot) as the primary way to read page content." + "Workflow: navigate (auto-includes snapshot with element refs) → click/fill/select by ref → snapshot to verify.\n\n" + "Use snapshot as the primary way to read pages. Use screenshot + send to show key results to the user. " + "For login/CAPTCHA/authorization etc., screenshot and ask the user for help." ) params: dict = { @@ -136,12 +136,15 @@ class BrowserTool(BaseTool): if not url.startswith(("http://", "https://")): url = "https://" + url timeout = args.get("timeout", 30000) - result = self._get_service().navigate(url, timeout=timeout) + service = self._get_service() + result = service.navigate(url, timeout=timeout) if "error" in result: return ToolResult.fail(result["error"]) + # Auto-snapshot after navigation so the agent gets page content in one call + snapshot_text = service.snapshot() return ToolResult.success( f"Navigated to: {result['url']}\nTitle: {result['title']}\nStatus: {result['status']}\n\n" - f"Use action 'snapshot' to see the page content." + f"--- Page Snapshot ---\n{snapshot_text}" ) def _do_snapshot(self, args: Dict[str, Any]) -> ToolResult: diff --git a/bridge/agent_bridge.py b/bridge/agent_bridge.py index 81caad3c..20e6e301 100644 --- a/bridge/agent_bridge.py +++ b/bridge/agent_bridge.py @@ -271,10 +271,13 @@ class AgentBridge: tool_manager.load_tools() tools = [] + workspace_dir = kwargs.get("workspace_dir") for tool_name in tool_manager.tool_classes.keys(): try: tool = tool_manager.create_tool(tool_name) if tool: + if workspace_dir and hasattr(tool, 'cwd'): + tool.cwd = workspace_dir tools.append(tool) except Exception as e: logger.warning(f"[AgentBridge] Failed to load tool {tool_name}: {e}") diff --git a/bridge/agent_initializer.py b/bridge/agent_initializer.py index f64d9715..26c67c48 100644 --- a/bridge/agent_initializer.py +++ b/bridge/agent_initializer.py @@ -366,7 +366,7 @@ class AgentInitializer: if tool: # Apply workspace config to file operation tools - if tool_name in ['read', 'write', 'edit', 'bash', 'grep', 'find', 'ls', 'web_fetch']: + if tool_name in ['read', 'write', 'edit', 'bash', 'grep', 'find', 'ls', 'web_fetch', 'send', 'browser']: tool.config = file_config tool.cwd = file_config.get("cwd", getattr(tool, 'cwd', None)) if 'memory_manager' in file_config: diff --git a/channel/web/static/js/console.js b/channel/web/static/js/console.js index aa47e23c..b9786142 100644 --- a/channel/web/static/js/console.js +++ b/channel/web/static/js/console.js @@ -719,6 +719,7 @@ function startSSE(requestId, loadingEl, timestamp) { let botEl = null; let stepsEl = null; // .agent-steps (thinking summaries + tool indicators) let contentEl = null; // .answer-content (final streaming answer) + let mediaEl = null; // .media-content (images & file attachments) let accumulatedText = ''; let currentToolEl = null; @@ -734,6 +735,7 @@ function startSSE(requestId, loadingEl, timestamp) {