mirror of
https://github.com/zhayujie/chatgpt-on-wechat.git
synced 2026-04-06 20:38:17 +08:00
feat(browser): auto-snapshot on navigate, screenshot prompt guidance
Browser tool enhancements: - Navigate action now auto-includes snapshot result, saving one LLM round-trip - Wait for networkidle + 800ms after navigation for SPA/JS-rendered pages - Prompt guides agent to screenshot key results and ask user for login/CAPTCHA help - Fixed playwright version pinned to 1.52.0; mirror fallback to official CDN on failure Web console file/image support: - SSE real-time push for images and files via on_event (file_to_send) - Added /api/file endpoint to serve local files for web preview - Frontend renders images in media-content container (survives delta/done overwrites) - File attachment cards with download links; RFC 5987 encoding for non-ASCII filenames Tool workspace fix: - Inject workspace_dir as cwd into send and browser tools (previously only file tools) - Screenshots now save to ~/cow/tmp/ instead of project directory
This commit is contained in:
@@ -165,7 +165,7 @@ def _build_tooling_section(tools: List[Any], language: str) -> List[str]:
|
||||
"terminal": "管理后台进程",
|
||||
"web_search": "网络搜索",
|
||||
"web_fetch": "获取URL内容",
|
||||
"browser": "控制浏览器",
|
||||
"browser": "控制浏览器(关键结果或需要协助可截图发送给用户)",
|
||||
"memory_search": "搜索记忆",
|
||||
"memory_get": "读取记忆内容",
|
||||
"env_config": "管理API密钥和技能配置",
|
||||
|
||||
@@ -300,13 +300,13 @@ class AgentStreamExecutor:
|
||||
f"with same arguments. This may indicate a loop."
|
||||
)
|
||||
|
||||
# Check if this is a file to send (from read tool)
|
||||
# Check if this is a file to send
|
||||
if result.get("status") == "success" and isinstance(result.get("result"), dict):
|
||||
result_data = result.get("result")
|
||||
if result_data.get("type") == "file_to_send":
|
||||
# Store file metadata for later sending
|
||||
self.files_to_send.append(result_data)
|
||||
logger.info(f"📎 检测到待发送文件: {result_data.get('file_name', result_data.get('path'))}")
|
||||
self._emit_event("file_to_send", result_data)
|
||||
|
||||
# Check for critical error - abort entire conversation
|
||||
if result.get("status") == "critical_error":
|
||||
|
||||
@@ -283,7 +283,7 @@ class BrowserService:
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
def navigate(self, url: str, timeout: int = 30000) -> Dict[str, Any]:
|
||||
"""Navigate to a URL and return page info."""
|
||||
"""Navigate to a URL and wait for the page to be fully rendered."""
|
||||
page = self.page
|
||||
try:
|
||||
resp = page.goto(url, wait_until="domcontentloaded", timeout=timeout)
|
||||
@@ -291,6 +291,14 @@ class BrowserService:
|
||||
except Exception as e:
|
||||
return {"error": f"Navigation failed: {e}"}
|
||||
|
||||
# Wait for network idle and visual stability
|
||||
try:
|
||||
page.wait_for_load_state("networkidle", timeout=10000)
|
||||
except Exception:
|
||||
pass
|
||||
# Extra settle time for JS-rendered content (SPA frameworks, animations)
|
||||
page.wait_for_timeout(800)
|
||||
|
||||
return {
|
||||
"url": page.url,
|
||||
"title": page.title(),
|
||||
|
||||
@@ -23,9 +23,9 @@ class BrowserTool(BaseTool):
|
||||
"Control a browser to navigate web pages, interact with elements, and extract content. "
|
||||
"Actions: navigate, snapshot, click, fill, select, scroll, screenshot, wait, back, forward, "
|
||||
"get_text, press, evaluate.\n\n"
|
||||
"Workflow: navigate to a URL → snapshot to see the page (elements get numeric refs) → "
|
||||
"use refs in click/fill/select actions → snapshot again to verify.\n\n"
|
||||
"Use snapshot (not screenshot) as the primary way to read page content."
|
||||
"Workflow: navigate (auto-includes snapshot with element refs) → click/fill/select by ref → snapshot to verify.\n\n"
|
||||
"Use snapshot as the primary way to read pages. Use screenshot + send to show key results to the user. "
|
||||
"For login/CAPTCHA/authorization etc., screenshot and ask the user for help."
|
||||
)
|
||||
|
||||
params: dict = {
|
||||
@@ -136,12 +136,15 @@ class BrowserTool(BaseTool):
|
||||
if not url.startswith(("http://", "https://")):
|
||||
url = "https://" + url
|
||||
timeout = args.get("timeout", 30000)
|
||||
result = self._get_service().navigate(url, timeout=timeout)
|
||||
service = self._get_service()
|
||||
result = service.navigate(url, timeout=timeout)
|
||||
if "error" in result:
|
||||
return ToolResult.fail(result["error"])
|
||||
# Auto-snapshot after navigation so the agent gets page content in one call
|
||||
snapshot_text = service.snapshot()
|
||||
return ToolResult.success(
|
||||
f"Navigated to: {result['url']}\nTitle: {result['title']}\nStatus: {result['status']}\n\n"
|
||||
f"Use action 'snapshot' to see the page content."
|
||||
f"--- Page Snapshot ---\n{snapshot_text}"
|
||||
)
|
||||
|
||||
def _do_snapshot(self, args: Dict[str, Any]) -> ToolResult:
|
||||
|
||||
@@ -271,10 +271,13 @@ class AgentBridge:
|
||||
tool_manager.load_tools()
|
||||
|
||||
tools = []
|
||||
workspace_dir = kwargs.get("workspace_dir")
|
||||
for tool_name in tool_manager.tool_classes.keys():
|
||||
try:
|
||||
tool = tool_manager.create_tool(tool_name)
|
||||
if tool:
|
||||
if workspace_dir and hasattr(tool, 'cwd'):
|
||||
tool.cwd = workspace_dir
|
||||
tools.append(tool)
|
||||
except Exception as e:
|
||||
logger.warning(f"[AgentBridge] Failed to load tool {tool_name}: {e}")
|
||||
|
||||
@@ -366,7 +366,7 @@ class AgentInitializer:
|
||||
|
||||
if tool:
|
||||
# Apply workspace config to file operation tools
|
||||
if tool_name in ['read', 'write', 'edit', 'bash', 'grep', 'find', 'ls', 'web_fetch']:
|
||||
if tool_name in ['read', 'write', 'edit', 'bash', 'grep', 'find', 'ls', 'web_fetch', 'send', 'browser']:
|
||||
tool.config = file_config
|
||||
tool.cwd = file_config.get("cwd", getattr(tool, 'cwd', None))
|
||||
if 'memory_manager' in file_config:
|
||||
|
||||
@@ -719,6 +719,7 @@ function startSSE(requestId, loadingEl, timestamp) {
|
||||
let botEl = null;
|
||||
let stepsEl = null; // .agent-steps (thinking summaries + tool indicators)
|
||||
let contentEl = null; // .answer-content (final streaming answer)
|
||||
let mediaEl = null; // .media-content (images & file attachments)
|
||||
let accumulatedText = '';
|
||||
let currentToolEl = null;
|
||||
|
||||
@@ -734,6 +735,7 @@ function startSSE(requestId, loadingEl, timestamp) {
|
||||
<div class="bg-white dark:bg-[#1A1A1A] border border-slate-200 dark:border-white/10 rounded-2xl px-4 py-3 text-sm leading-relaxed msg-content text-slate-700 dark:text-slate-200">
|
||||
<div class="agent-steps"></div>
|
||||
<div class="answer-content sse-streaming"></div>
|
||||
<div class="media-content"></div>
|
||||
</div>
|
||||
<div class="text-xs text-slate-400 dark:text-slate-500 mt-1.5">${formatTime(timestamp)}</div>
|
||||
</div>
|
||||
@@ -741,6 +743,7 @@ function startSSE(requestId, loadingEl, timestamp) {
|
||||
messagesDiv.appendChild(botEl);
|
||||
stepsEl = botEl.querySelector('.agent-steps');
|
||||
contentEl = botEl.querySelector('.answer-content');
|
||||
mediaEl = botEl.querySelector('.media-content');
|
||||
}
|
||||
|
||||
es.onmessage = function(e) {
|
||||
@@ -831,6 +834,29 @@ function startSSE(requestId, loadingEl, timestamp) {
|
||||
currentToolEl = null;
|
||||
}
|
||||
|
||||
} else if (item.type === 'image') {
|
||||
ensureBotEl();
|
||||
const imgEl = document.createElement('img');
|
||||
imgEl.src = item.content;
|
||||
imgEl.alt = 'screenshot';
|
||||
imgEl.style.cssText = 'max-width:360px;border-radius:8px;margin:8px 0;cursor:pointer;box-shadow:0 1px 4px rgba(0,0,0,0.1);';
|
||||
imgEl.onclick = () => window.open(item.content, '_blank');
|
||||
mediaEl.appendChild(imgEl);
|
||||
scrollChatToBottom();
|
||||
|
||||
} else if (item.type === 'file') {
|
||||
ensureBotEl();
|
||||
const fileName = item.file_name || item.content.split('/').pop();
|
||||
const fileEl = document.createElement('a');
|
||||
fileEl.href = item.content;
|
||||
fileEl.download = fileName;
|
||||
fileEl.target = '_blank';
|
||||
fileEl.className = 'file-attachment';
|
||||
fileEl.style.cssText = 'display:inline-flex;align-items:center;gap:6px;padding:8px 14px;margin:8px 0;border-radius:8px;background:var(--bg-secondary,#f3f4f6);color:var(--text-primary,#374151);text-decoration:none;font-size:14px;border:1px solid var(--border-color,#e5e7eb);';
|
||||
fileEl.innerHTML = `<i class="fas fa-file-download" style="color:#6b7280;"></i> ${fileName}`;
|
||||
mediaEl.appendChild(fileEl);
|
||||
scrollChatToBottom();
|
||||
|
||||
} else if (item.type === 'done') {
|
||||
es.close();
|
||||
delete activeStreams[requestId];
|
||||
|
||||
@@ -99,6 +99,21 @@ class WebChannel(ChatChannel):
|
||||
# SSE mode: push done event to SSE queue
|
||||
if request_id in self.sse_queues:
|
||||
content = reply.content if reply.content is not None else ""
|
||||
|
||||
# Files are already pushed via on_event (file_to_send) during agent execution.
|
||||
# Skip duplicate file pushes here; just let the done event through.
|
||||
if reply.type in (ReplyType.IMAGE_URL, ReplyType.FILE) and content.startswith("file://"):
|
||||
text_content = getattr(reply, 'text_content', '')
|
||||
if text_content:
|
||||
self.sse_queues[request_id].put({
|
||||
"type": "done",
|
||||
"content": text_content,
|
||||
"request_id": request_id,
|
||||
"timestamp": time.time()
|
||||
})
|
||||
logger.debug(f"SSE skipped duplicate file for request {request_id}")
|
||||
return
|
||||
|
||||
self.sse_queues[request_id].put({
|
||||
"type": "done",
|
||||
"content": content,
|
||||
@@ -161,6 +176,19 @@ class WebChannel(ChatChannel):
|
||||
"execution_time": round(exec_time, 2)
|
||||
})
|
||||
|
||||
elif event_type == "file_to_send":
|
||||
file_path = data.get("path", "")
|
||||
file_name = data.get("file_name", os.path.basename(file_path))
|
||||
file_type = data.get("file_type", "file")
|
||||
from urllib.parse import quote
|
||||
web_url = f"/api/file?path={quote(file_path)}"
|
||||
is_image = file_type == "image"
|
||||
q.put({
|
||||
"type": "image" if is_image else "file",
|
||||
"content": web_url,
|
||||
"file_name": file_name,
|
||||
})
|
||||
|
||||
return on_event
|
||||
|
||||
def upload_file(self):
|
||||
@@ -377,6 +405,7 @@ class WebChannel(ChatChannel):
|
||||
'/message', 'MessageHandler',
|
||||
'/upload', 'UploadHandler',
|
||||
'/uploads/(.*)', 'UploadsHandler',
|
||||
'/api/file', 'FileServeHandler',
|
||||
'/poll', 'PollHandler',
|
||||
'/stream', 'StreamHandler',
|
||||
'/chat', 'ChatHandler',
|
||||
@@ -463,6 +492,32 @@ class UploadsHandler:
|
||||
raise web.notfound()
|
||||
|
||||
|
||||
class FileServeHandler:
|
||||
def GET(self):
|
||||
"""Serve a local file by absolute path (for agent send tool)."""
|
||||
try:
|
||||
params = web.input(path="")
|
||||
file_path = params.path
|
||||
if not file_path or not os.path.isabs(file_path):
|
||||
raise web.notfound()
|
||||
file_path = os.path.normpath(file_path)
|
||||
if not os.path.isfile(file_path):
|
||||
raise web.notfound()
|
||||
content_type = mimetypes.guess_type(file_path)[0] or "application/octet-stream"
|
||||
file_name = os.path.basename(file_path)
|
||||
from urllib.parse import quote
|
||||
web.header('Content-Type', content_type)
|
||||
web.header('Content-Disposition', f"inline; filename*=UTF-8''{quote(file_name)}")
|
||||
web.header('Cache-Control', 'public, max-age=3600')
|
||||
with open(file_path, 'rb') as f:
|
||||
return f.read()
|
||||
except web.HTTPError:
|
||||
raise
|
||||
except Exception as e:
|
||||
logger.error(f"[WebChannel] Error serving file: {e}")
|
||||
raise web.notfound()
|
||||
|
||||
|
||||
class PollHandler:
|
||||
def POST(self):
|
||||
return WebChannel().poll_response()
|
||||
|
||||
Reference in New Issue
Block a user