feat(browser): auto-snapshot on navigate, screenshot prompt guidance

Browser tool enhancements:
- Navigate action now auto-includes the snapshot result, saving one LLM round-trip
- Wait for networkidle + 800ms after navigation for SPA/JS-rendered pages
- Prompt guides the agent to screenshot key results and ask the user for login/CAPTCHA help
- Pinned playwright version to 1.52.0; mirror falls back to the official CDN on failure

Web console file/image support:
- SSE real-time push for images and files via on_event (file_to_send)
- Added /api/file endpoint to serve local files for web preview
- Frontend renders images in a media-content container (survives delta/done overwrites)
- File attachment cards with download links; RFC 5987 encoding for non-ASCII filenames

Tool workspace fix:
- Inject workspace_dir as cwd into send and browser tools (previously only file tools)
- Screenshots now save to ~/cow/tmp/ instead of the project directory
2026-04-06 20:38:17 +08:00 · 2026-03-29 19:09:11 +08:00
parent 511ee0bbaf
commit d09ae49287
8 changed files with 105 additions and 10 deletions
--- a/agent/prompt/builder.py
+++ b/agent/prompt/builder.py
@@ -165,7 +165,7 @@ def _build_tooling_section(tools: List[Any], language: str) -> List[str]:
        "terminal": "管理后台进程",
        "web_search": "网络搜索",
        "web_fetch": "获取URL内容",
-        "browser": "控制浏览器",
+        "browser": "控制浏览器（关键结果或需要协助可截图发送给用户）",
        "memory_search": "搜索记忆",
        "memory_get": "读取记忆内容",
        "env_config": "管理API密钥和技能配置",
--- a/agent/protocol/agent_stream.py
+++ b/agent/protocol/agent_stream.py
@@ -300,13 +300,13 @@ class AgentStreamExecutor:
                                    f"with same arguments. This may indicate a loop."
                                )
                        
-                        # Check if this is a file to send (from read tool)
+                        # Check if this is a file to send
                        if result.get("status") == "success" and isinstance(result.get("result"), dict):
                            result_data = result.get("result")
                            if result_data.get("type") == "file_to_send":
-                                # Store file metadata for later sending
                                self.files_to_send.append(result_data)
                                logger.info(f"📎 检测到待发送文件: {result_data.get('file_name', result_data.get('path'))}")
+                                self._emit_event("file_to_send", result_data)
                        
                        # Check for critical error - abort entire conversation
                        if result.get("status") == "critical_error":
--- a/agent/tools/browser/browser_service.py
+++ b/agent/tools/browser/browser_service.py
@@ -283,7 +283,7 @@ class BrowserService:
    # ------------------------------------------------------------------

    def navigate(self, url: str, timeout: int = 30000) -> Dict[str, Any]:
-        """Navigate to a URL and return page info."""
+        """Navigate to a URL and wait for the page to be fully rendered."""
        page = self.page
        try:
            resp = page.goto(url, wait_until="domcontentloaded", timeout=timeout)
@@ -291,6 +291,14 @@ class BrowserService:
        except Exception as e:
            return {"error": f"Navigation failed: {e}"}

+        # Wait for network idle and visual stability
+        try:
+            page.wait_for_load_state("networkidle", timeout=10000)
+        except Exception:
+            pass
+        # Extra settle time for JS-rendered content (SPA frameworks, animations)
+        page.wait_for_timeout(800)
+
        return {
            "url": page.url,
            "title": page.title(),
--- a/agent/tools/browser/browser_tool.py
+++ b/agent/tools/browser/browser_tool.py
@@ -23,9 +23,9 @@ class BrowserTool(BaseTool):
        "Control a browser to navigate web pages, interact with elements, and extract content. "
        "Actions: navigate, snapshot, click, fill, select, scroll, screenshot, wait, back, forward, "
        "get_text, press, evaluate.\n\n"
-        "Workflow: navigate to a URL → snapshot to see the page (elements get numeric refs) → "
-        "use refs in click/fill/select actions → snapshot again to verify.\n\n"
-        "Use snapshot (not screenshot) as the primary way to read page content."
+        "Workflow: navigate (auto-includes snapshot with element refs) → click/fill/select by ref → snapshot to verify.\n\n"
+        "Use snapshot as the primary way to read pages. Use screenshot + send to show key results to the user. "
+        "For login/CAPTCHA/authorization etc., screenshot and ask the user for help."
    )

    params: dict = {
@@ -136,12 +136,15 @@ class BrowserTool(BaseTool):
        if not url.startswith(("http://", "https://")):
            url = "https://" + url
        timeout = args.get("timeout", 30000)
-        result = self._get_service().navigate(url, timeout=timeout)
+        service = self._get_service()
+        result = service.navigate(url, timeout=timeout)
        if "error" in result:
            return ToolResult.fail(result["error"])
+        # Auto-snapshot after navigation so the agent gets page content in one call
+        snapshot_text = service.snapshot()
        return ToolResult.success(
            f"Navigated to: {result['url']}\nTitle: {result['title']}\nStatus: {result['status']}\n\n"
-            f"Use action 'snapshot' to see the page content."
+            f"--- Page Snapshot ---\n{snapshot_text}"
        )

    def _do_snapshot(self, args: Dict[str, Any]) -> ToolResult:
--- a/bridge/agent_bridge.py
+++ b/bridge/agent_bridge.py
@@ -271,10 +271,13 @@ class AgentBridge:
            tool_manager.load_tools()
            
            tools = []
+            workspace_dir = kwargs.get("workspace_dir")
            for tool_name in tool_manager.tool_classes.keys():
                try:
                    tool = tool_manager.create_tool(tool_name)
                    if tool:
+                        if workspace_dir and hasattr(tool, 'cwd'):
+                            tool.cwd = workspace_dir
                        tools.append(tool)
                except Exception as e:
                    logger.warning(f"[AgentBridge] Failed to load tool {tool_name}: {e}")
--- a/bridge/agent_initializer.py
+++ b/bridge/agent_initializer.py
@@ -366,7 +366,7 @@ class AgentInitializer:

                if tool:
                    # Apply workspace config to file operation tools
-                    if tool_name in ['read', 'write', 'edit', 'bash', 'grep', 'find', 'ls', 'web_fetch']:
+                    if tool_name in ['read', 'write', 'edit', 'bash', 'grep', 'find', 'ls', 'web_fetch', 'send', 'browser']:
                        tool.config = file_config
                        tool.cwd = file_config.get("cwd", getattr(tool, 'cwd', None))
                        if 'memory_manager' in file_config:
--- a/channel/web/static/js/console.js
+++ b/channel/web/static/js/console.js
@@ -719,6 +719,7 @@ function startSSE(requestId, loadingEl, timestamp) {
    let botEl = null;
    let stepsEl = null;    // .agent-steps  (thinking summaries + tool indicators)
    let contentEl = null;  // .answer-content (final streaming answer)
+    let mediaEl = null;    // .media-content (images & file attachments)
    let accumulatedText = '';
    let currentToolEl = null;

@@ -734,6 +735,7 @@ function startSSE(requestId, loadingEl, timestamp) {
                <div class="bg-white dark:bg-[#1A1A1A] border border-slate-200 dark:border-white/10 rounded-2xl px-4 py-3 text-sm leading-relaxed msg-content text-slate-700 dark:text-slate-200">
                    <div class="agent-steps"></div>
                    <div class="answer-content sse-streaming"></div>
+                    <div class="media-content"></div>
                </div>
                <div class="text-xs text-slate-400 dark:text-slate-500 mt-1.5">${formatTime(timestamp)}</div>
            </div>
@@ -741,6 +743,7 @@ function startSSE(requestId, loadingEl, timestamp) {
        messagesDiv.appendChild(botEl);
        stepsEl = botEl.querySelector('.agent-steps');
        contentEl = botEl.querySelector('.answer-content');
+        mediaEl = botEl.querySelector('.media-content');
    }

    es.onmessage = function(e) {
@@ -831,6 +834,29 @@ function startSSE(requestId, loadingEl, timestamp) {
                currentToolEl = null;
            }

+        } else if (item.type === 'image') {
+            ensureBotEl();
+            const imgEl = document.createElement('img');
+            imgEl.src = item.content;
+            imgEl.alt = 'screenshot';
+            imgEl.style.cssText = 'max-width:360px;border-radius:8px;margin:8px 0;cursor:pointer;box-shadow:0 1px 4px rgba(0,0,0,0.1);';
+            imgEl.onclick = () => window.open(item.content, '_blank');
+            mediaEl.appendChild(imgEl);
+            scrollChatToBottom();
+
+        } else if (item.type === 'file') {
+            ensureBotEl();
+            const fileName = item.file_name || item.content.split('/').pop();
+            const fileEl = document.createElement('a');
+            fileEl.href = item.content;
+            fileEl.download = fileName;
+            fileEl.target = '_blank';
+            fileEl.className = 'file-attachment';
+            fileEl.style.cssText = 'display:inline-flex;align-items:center;gap:6px;padding:8px 14px;margin:8px 0;border-radius:8px;background:var(--bg-secondary,#f3f4f6);color:var(--text-primary,#374151);text-decoration:none;font-size:14px;border:1px solid var(--border-color,#e5e7eb);';
+            fileEl.innerHTML = `<i class="fas fa-file-download" style="color:#6b7280;"></i> ${fileName}`;
+            mediaEl.appendChild(fileEl);
+            scrollChatToBottom();
+
        } else if (item.type === 'done') {
            es.close();
            delete activeStreams[requestId];
--- a/channel/web/web_channel.py
+++ b/channel/web/web_channel.py
@@ -99,6 +99,21 @@ class WebChannel(ChatChannel):
            # SSE mode: push done event to SSE queue
            if request_id in self.sse_queues:
                content = reply.content if reply.content is not None else ""
+
+                # Files are already pushed via on_event (file_to_send) during agent execution.
+                # Skip duplicate file pushes here; just let the done event through.
+                if reply.type in (ReplyType.IMAGE_URL, ReplyType.FILE) and content.startswith("file://"):
+                    text_content = getattr(reply, 'text_content', '')
+                    if text_content:
+                        self.sse_queues[request_id].put({
+                            "type": "done",
+                            "content": text_content,
+                            "request_id": request_id,
+                            "timestamp": time.time()
+                        })
+                    logger.debug(f"SSE skipped duplicate file for request {request_id}")
+                    return
+
                self.sse_queues[request_id].put({
                    "type": "done",
                    "content": content,
@@ -161,6 +176,19 @@ class WebChannel(ChatChannel):
                    "execution_time": round(exec_time, 2)
                })

+            elif event_type == "file_to_send":
+                file_path = data.get("path", "")
+                file_name = data.get("file_name", os.path.basename(file_path))
+                file_type = data.get("file_type", "file")
+                from urllib.parse import quote
+                web_url = f"/api/file?path={quote(file_path)}"
+                is_image = file_type == "image"
+                q.put({
+                    "type": "image" if is_image else "file",
+                    "content": web_url,
+                    "file_name": file_name,
+                })
+
        return on_event

    def upload_file(self):
@@ -377,6 +405,7 @@ class WebChannel(ChatChannel):
            '/message', 'MessageHandler',
            '/upload', 'UploadHandler',
            '/uploads/(.*)', 'UploadsHandler',
+            '/api/file', 'FileServeHandler',
            '/poll', 'PollHandler',
            '/stream', 'StreamHandler',
            '/chat', 'ChatHandler',
@@ -463,6 +492,32 @@ class UploadsHandler:
            raise web.notfound()


+class FileServeHandler:  # handler class mapped to the '/api/file' route
+    def GET(self):
+        """Serve a local file by absolute path (for agent send tool)."""
+        try:
+            params = web.input(path="")  # `path` query parameter; "" when it is missing
+            file_path = params.path
+            if not file_path or not os.path.isabs(file_path):  # empty or relative path -> 404
+                raise web.notfound()
+            file_path = os.path.normpath(file_path)  # NOTE(review): no directory allow-list in this handler; any absolute path that is a regular file gets served - confirm access is restricted elsewhere
+            if not os.path.isfile(file_path):  # missing paths and directories -> 404
+                raise web.notfound()
+            content_type = mimetypes.guess_type(file_path)[0] or "application/octet-stream"  # type guessed from the file name; generic binary fallback
+            file_name = os.path.basename(file_path)
+            from urllib.parse import quote
+            web.header('Content-Type', content_type)
+            web.header('Content-Disposition', f"inline; filename*=UTF-8''{quote(file_name)}")  # percent-encoded name so non-ASCII file names survive the header
+            web.header('Cache-Control', 'public, max-age=3600')  # NOTE(review): 'public' lets shared caches store local file content - confirm this is intended
+            with open(file_path, 'rb') as f:
+                return f.read()  # whole file is read into memory and returned as the response body
+        except web.HTTPError:  # let the 404s raised above pass through without being logged as errors
+            raise
+        except Exception as e:  # any other failure (e.g. unreadable file) is logged and reported as 404
+            logger.error(f"[WebChannel] Error serving file: {e}")
+            raise web.notfound()
+
+
 class PollHandler:
    def POST(self):
        return WebChannel().poll_response()