diff --git a/agent/prompt/builder.py b/agent/prompt/builder.py index 4a54963e..f5218622 100644 --- a/agent/prompt/builder.py +++ b/agent/prompt/builder.py @@ -165,7 +165,7 @@ def _build_tooling_section(tools: List[Any], language: str) -> List[str]: "terminal": "管理后台进程", "web_search": "网络搜索", "web_fetch": "获取URL内容", - "browser": "控制浏览器", + "browser": "控制浏览器(关键结果或需要协助可截图发送给用户)", "memory_search": "搜索记忆", "memory_get": "读取记忆内容", "env_config": "管理API密钥和技能配置", diff --git a/agent/protocol/agent_stream.py b/agent/protocol/agent_stream.py index 79dcd2ab..1b250011 100644 --- a/agent/protocol/agent_stream.py +++ b/agent/protocol/agent_stream.py @@ -300,13 +300,13 @@ class AgentStreamExecutor: f"with same arguments. This may indicate a loop." ) - # Check if this is a file to send (from read tool) + # Check if this is a file to send if result.get("status") == "success" and isinstance(result.get("result"), dict): result_data = result.get("result") if result_data.get("type") == "file_to_send": - # Store file metadata for later sending self.files_to_send.append(result_data) logger.info(f"📎 检测到待发送文件: {result_data.get('file_name', result_data.get('path'))}") + self._emit_event("file_to_send", result_data) # Check for critical error - abort entire conversation if result.get("status") == "critical_error": diff --git a/agent/tools/browser/browser_service.py b/agent/tools/browser/browser_service.py index d502ffb3..3065135a 100644 --- a/agent/tools/browser/browser_service.py +++ b/agent/tools/browser/browser_service.py @@ -283,7 +283,7 @@ class BrowserService: # ------------------------------------------------------------------ def navigate(self, url: str, timeout: int = 30000) -> Dict[str, Any]: - """Navigate to a URL and return page info.""" + """Navigate to a URL and wait for the page to be fully rendered.""" page = self.page try: resp = page.goto(url, wait_until="domcontentloaded", timeout=timeout) @@ -291,6 +291,14 @@ class BrowserService: except Exception as e: return {"error": 
f"Navigation failed: {e}"} + # Wait for network idle and visual stability + try: + page.wait_for_load_state("networkidle", timeout=10000) + except Exception: + pass + # Extra settle time for JS-rendered content (SPA frameworks, animations) + page.wait_for_timeout(800) + return { "url": page.url, "title": page.title(), diff --git a/agent/tools/browser/browser_tool.py b/agent/tools/browser/browser_tool.py index 0b51fa26..0d16406b 100644 --- a/agent/tools/browser/browser_tool.py +++ b/agent/tools/browser/browser_tool.py @@ -23,9 +23,9 @@ class BrowserTool(BaseTool): "Control a browser to navigate web pages, interact with elements, and extract content. " "Actions: navigate, snapshot, click, fill, select, scroll, screenshot, wait, back, forward, " "get_text, press, evaluate.\n\n" - "Workflow: navigate to a URL → snapshot to see the page (elements get numeric refs) → " - "use refs in click/fill/select actions → snapshot again to verify.\n\n" - "Use snapshot (not screenshot) as the primary way to read page content." + "Workflow: navigate (auto-includes snapshot with element refs) → click/fill/select by ref → snapshot to verify.\n\n" + "Use snapshot as the primary way to read pages. Use screenshot + send to show key results to the user. " + "For login/CAPTCHA/authorization etc., screenshot and ask the user for help." ) params: dict = { @@ -136,12 +136,15 @@ class BrowserTool(BaseTool): if not url.startswith(("http://", "https://")): url = "https://" + url timeout = args.get("timeout", 30000) - result = self._get_service().navigate(url, timeout=timeout) + service = self._get_service() + result = service.navigate(url, timeout=timeout) if "error" in result: return ToolResult.fail(result["error"]) + # Auto-snapshot after navigation so the agent gets page content in one call + snapshot_text = service.snapshot() return ToolResult.success( f"Navigated to: {result['url']}\nTitle: {result['title']}\nStatus: {result['status']}\n\n" - f"Use action 'snapshot' to see the page content." 
+ f"--- Page Snapshot ---\n{snapshot_text}" ) def _do_snapshot(self, args: Dict[str, Any]) -> ToolResult: diff --git a/bridge/agent_bridge.py b/bridge/agent_bridge.py index 81caad3c..20e6e301 100644 --- a/bridge/agent_bridge.py +++ b/bridge/agent_bridge.py @@ -271,10 +271,13 @@ class AgentBridge: tool_manager.load_tools() tools = [] + workspace_dir = kwargs.get("workspace_dir") for tool_name in tool_manager.tool_classes.keys(): try: tool = tool_manager.create_tool(tool_name) if tool: + if workspace_dir and hasattr(tool, 'cwd'): + tool.cwd = workspace_dir tools.append(tool) except Exception as e: logger.warning(f"[AgentBridge] Failed to load tool {tool_name}: {e}") diff --git a/bridge/agent_initializer.py b/bridge/agent_initializer.py index f64d9715..26c67c48 100644 --- a/bridge/agent_initializer.py +++ b/bridge/agent_initializer.py @@ -366,7 +366,7 @@ class AgentInitializer: if tool: # Apply workspace config to file operation tools - if tool_name in ['read', 'write', 'edit', 'bash', 'grep', 'find', 'ls', 'web_fetch']: + if tool_name in ['read', 'write', 'edit', 'bash', 'grep', 'find', 'ls', 'web_fetch', 'send', 'browser']: tool.config = file_config tool.cwd = file_config.get("cwd", getattr(tool, 'cwd', None)) if 'memory_manager' in file_config: diff --git a/channel/web/static/js/console.js b/channel/web/static/js/console.js index aa47e23c..b9786142 100644 --- a/channel/web/static/js/console.js +++ b/channel/web/static/js/console.js @@ -719,6 +719,7 @@ function startSSE(requestId, loadingEl, timestamp) { let botEl = null; let stepsEl = null; // .agent-steps (thinking summaries + tool indicators) let contentEl = null; // .answer-content (final streaming answer) + let mediaEl = null; // .media-content (images & file attachments) let accumulatedText = ''; let currentToolEl = null; @@ -734,6 +735,7 @@ function startSSE(requestId, loadingEl, timestamp) {
+
${formatTime(timestamp)}
@@ -741,6 +743,7 @@ function startSSE(requestId, loadingEl, timestamp) { messagesDiv.appendChild(botEl); stepsEl = botEl.querySelector('.agent-steps'); contentEl = botEl.querySelector('.answer-content'); + mediaEl = botEl.querySelector('.media-content'); } es.onmessage = function(e) { @@ -831,6 +834,29 @@ function startSSE(requestId, loadingEl, timestamp) { currentToolEl = null; } + } else if (item.type === 'image') { + ensureBotEl(); + const imgEl = document.createElement('img'); + imgEl.src = item.content; + imgEl.alt = 'screenshot'; + imgEl.style.cssText = 'max-width:360px;border-radius:8px;margin:8px 0;cursor:pointer;box-shadow:0 1px 4px rgba(0,0,0,0.1);'; + imgEl.onclick = () => window.open(item.content, '_blank'); + mediaEl.appendChild(imgEl); + scrollChatToBottom(); + + } else if (item.type === 'file') { + ensureBotEl(); + const fileName = item.file_name || item.content.split('/').pop(); + const fileEl = document.createElement('a'); + fileEl.href = item.content; + fileEl.download = fileName; + fileEl.target = '_blank'; + fileEl.className = 'file-attachment'; + fileEl.style.cssText = 'display:inline-flex;align-items:center;gap:6px;padding:8px 14px;margin:8px 0;border-radius:8px;background:var(--bg-secondary,#f3f4f6);color:var(--text-primary,#374151);text-decoration:none;font-size:14px;border:1px solid var(--border-color,#e5e7eb);'; + fileEl.innerHTML = ` ${fileName}`; + mediaEl.appendChild(fileEl); + scrollChatToBottom(); + } else if (item.type === 'done') { es.close(); delete activeStreams[requestId]; diff --git a/channel/web/web_channel.py b/channel/web/web_channel.py index 18770e96..cc77a771 100644 --- a/channel/web/web_channel.py +++ b/channel/web/web_channel.py @@ -99,6 +99,21 @@ class WebChannel(ChatChannel): # SSE mode: push done event to SSE queue if request_id in self.sse_queues: content = reply.content if reply.content is not None else "" + + # Files are already pushed via on_event (file_to_send) during agent execution. 
+                # Skip duplicate file pushes here; just let the done event through.
+                if reply.type in (ReplyType.IMAGE_URL, ReplyType.FILE) and content.startswith("file://"):
+                    text_content = getattr(reply, 'text_content', '')
+                    if text_content:
+                        self.sse_queues[request_id].put({
+                            "type": "done",
+                            "content": text_content,
+                            "request_id": request_id,
+                            "timestamp": time.time()
+                        })
+                    logger.debug(f"SSE skipped duplicate file for request {request_id}")
+                    return
+
                 self.sse_queues[request_id].put({
                     "type": "done",
                     "content": content,
@@ -161,6 +176,22 @@ class WebChannel(ChatChannel):
                     "execution_time": round(exec_time, 2)
                 })
 
+            elif event_type == "file_to_send":
+                file_path = data.get("path", "")
+                file_name = data.get("file_name", os.path.basename(file_path))
+                file_type = data.get("file_type", "file")
+                if file_path:
+                    # Register the resolved path so FileServeHandler will serve it (and nothing else).
+                    _SERVABLE_FILES.add(os.path.realpath(file_path))
+                from urllib.parse import quote
+                web_url = f"/api/file?path={quote(file_path)}"
+                is_image = file_type == "image"
+                q.put({
+                    "type": "image" if is_image else "file",
+                    "content": web_url,
+                    "file_name": file_name,
+                })
+
         return on_event
 
     def upload_file(self):
@@ -377,6 +408,7 @@ class WebChannel(ChatChannel):
             '/message', 'MessageHandler',
             '/upload', 'UploadHandler',
             '/uploads/(.*)', 'UploadsHandler',
+            '/api/file', 'FileServeHandler',
             '/poll', 'PollHandler',
             '/stream', 'StreamHandler',
             '/chat', 'ChatHandler',
@@ -463,6 +495,58 @@ class UploadsHandler:
             raise web.notfound()
 
 
+# Resolved paths the agent has announced through "file_to_send" events. Only
+# these may be fetched via /api/file, so the endpoint cannot be used to read
+# arbitrary files from the host.
+_SERVABLE_FILES = set()
+
+
+class FileServeHandler:
+    """Serve agent-announced local files to the web console."""
+
+    def GET(self):
+        """Serve a local file by absolute path (for agent send tool).
+
+        The ``path`` query parameter must be an absolute path whose resolved
+        location was registered in ``_SERVABLE_FILES``; every other request
+        is answered with 404.
+
+        Returns:
+            The raw bytes of the requested file.
+
+        Raises:
+            web.HTTPError: 404 for an empty, relative, unregistered, missing or unreadable path.
+        """
+        try:
+            params = web.input(path="")
+            file_path = params.path
+            if not file_path or not os.path.isabs(file_path):
+                raise web.notfound()
+            # realpath (not normpath): symlinks are resolved before the allowlist check.
+            file_path = os.path.realpath(file_path)
+            if file_path not in _SERVABLE_FILES or not os.path.isfile(file_path):
+                raise web.notfound()
+            content_type = mimetypes.guess_type(file_path)[0] or "application/octet-stream"
+            file_name = os.path.basename(file_path)
+            # Only raster images render inline; HTML/SVG/etc. are downloaded so
+            # they cannot run script in the console's origin.
+            is_inline_image = content_type.startswith("image/") and content_type != "image/svg+xml"
+            disposition = "inline" if is_inline_image else "attachment"
+            from urllib.parse import quote
+            web.header('Content-Type', content_type)
+            web.header('Content-Disposition', f"{disposition}; filename*=UTF-8''{quote(file_name)}")
+            web.header('X-Content-Type-Options', 'nosniff')
+            # Files come from the local disk: do not let shared caches store them.
+            web.header('Cache-Control', 'private, max-age=3600')
+            with open(file_path, 'rb') as f:
+                return f.read()
+        except web.HTTPError:
+            raise
+        except Exception as e:
+            logger.error(f"[WebChannel] Error serving file: {e}")
+            raise web.notfound()
+
+
 class PollHandler:
     def POST(self):
         return WebChannel().poll_response()