fix: long-term memory bug

2026-04-18 01:53:47 +08:00 · 2026-01-30 11:31:13 +08:00
parent bb850bb6c5
commit 5a466d0ff6
12 changed files with 202 additions and 295 deletions
--- a/agent/memory/config.py
+++ b/agent/memory/config.py
@@ -28,7 +28,7 @@ class MemoryConfig:
    
    # Search config
    max_results: int = 10
-    min_score: float = 0.3
+    min_score: float = 0.1
    
    # Hybrid search weights
    vector_weight: float = 0.7
--- a/agent/memory/manager.py
+++ b/agent/memory/manager.py
@@ -213,7 +213,6 @@ class MemoryManager:
            
            memory_chunks.append(MemoryChunk(
                id=chunk_id,
-                agent_id="default",
                user_id=user_id,
                scope=scope,
                source=source,
@@ -330,7 +329,6 @@ class MemoryManager:
            
            memory_chunks.append(MemoryChunk(
                id=chunk_id,
-                agent_id="default",
                user_id=user_id,
                scope=scope,
                source=source,
@@ -428,7 +426,7 @@ class MemoryManager:
        
        return success
    
-    def build_memory_guidance(self, lang: str = "en", include_context: bool = True) -> str:
+    def build_memory_guidance(self, lang: str = "zh", include_context: bool = True) -> str:
        """
        Build natural memory guidance for agent system prompt
        
@@ -450,7 +448,7 @@ class MemoryManager:
        
        if lang == "zh":
            guidance = f"""## 记忆召回
-回答关于过去工作、决策、日期、人物、偏好或待办事项的问题前：先用 memory_search 搜索 MEMORY.md + memory/*.md；然后用 memory_get 只读取需要的行。如果搜索后仍不确定，说明你已检查过。
+下方"背景知识"包含你的核心长期记忆，可直接使用。如果背景知识中没有相关信息，再用 memory_search 搜索历史记录（memory/*.md 日期文件）。

 ## 记忆存储
 当用户分享持久偏好、决策或重要事实时（无论是否明确要求"记住"），主动存储：
@@ -465,7 +463,7 @@ class MemoryManager:
 - 自然使用记忆，就像你本来就知道这些信息"""
        else:
            guidance = f"""## Memory Recall
-Before answering anything about prior work, decisions, dates, people, preferences, or todos: run memory_search on MEMORY.md + memory/*.md; then use memory_get to pull only the needed lines. If low confidence after search, say you checked.
+"Background Knowledge" below contains your core long-term memories - use them directly. If information is not in Background Knowledge, use memory_search to search, then use memory_get to read files (path format: memory/MEMORY.md, memory/2026-01-30.md).

 ## Memory Storage
 When user shares durable preferences, decisions, or important facts (whether or not they explicitly say "remember"), proactively store:
--- a/agent/memory/storage.py
+++ b/agent/memory/storage.py
@@ -50,11 +50,45 @@ class MemoryStorage:
    
    def _init_db(self):
        """Initialize database with schema"""
-        self.conn = sqlite3.connect(str(self.db_path))
-        self.conn.row_factory = sqlite3.Row
-        
-        # Enable JSON support
-        self.conn.execute("PRAGMA journal_mode=WAL")
+        try:
+            self.conn = sqlite3.connect(str(self.db_path), check_same_thread=False)
+            self.conn.row_factory = sqlite3.Row
+            
+            # Check database integrity
+            try:
+                result = self.conn.execute("PRAGMA integrity_check").fetchone()
+                if result[0] != 'ok':
+                    print(f"⚠️  Database integrity check failed: {result[0]}")
+                    print(f"   Recreating database...")
+                    self.conn.close()
+                    self.conn = None
+                    # Remove corrupted database
+                    self.db_path.unlink(missing_ok=True)
+                    # Remove WAL files
+                    Path(str(self.db_path) + '-wal').unlink(missing_ok=True)
+                    Path(str(self.db_path) + '-shm').unlink(missing_ok=True)
+                    # Reconnect to create new database
+                    self.conn = sqlite3.connect(str(self.db_path), check_same_thread=False)
+                    self.conn.row_factory = sqlite3.Row
+            except sqlite3.DatabaseError:
+                # Database is corrupted, recreate it
+                print(f"⚠️  Database is corrupted, recreating...")
+                if self.conn:
+                    self.conn.close()
+                    self.conn = None
+                self.db_path.unlink(missing_ok=True)
+                Path(str(self.db_path) + '-wal').unlink(missing_ok=True)
+                Path(str(self.db_path) + '-shm').unlink(missing_ok=True)
+                self.conn = sqlite3.connect(str(self.db_path), check_same_thread=False)
+                self.conn.row_factory = sqlite3.Row
+            
+            # Enable WAL mode for better concurrency
+            self.conn.execute("PRAGMA journal_mode=WAL")
+            # Set busy timeout to avoid "database is locked" errors
+            self.conn.execute("PRAGMA busy_timeout=5000")
+        except Exception as e:
+            print(f"⚠️  Unexpected error during database initialization: {e}")
+            raise
        
        # Create chunks table with embeddings
        self.conn.execute("""
@@ -92,6 +126,8 @@ class MemoryStorage:
        """)
        
        # Create FTS5 virtual table for keyword search
+        # Use default unicode61 tokenizer (stable and compatible)
+        # For CJK support, we'll use LIKE queries as fallback
        self.conn.execute("""
            CREATE VIRTUAL TABLE IF NOT EXISTS chunks_fts USING fts5(
                text,
@@ -261,13 +297,37 @@ class MemoryStorage:
        scopes: List[str] = None,
        limit: int = 10
    ) -> List[SearchResult]:
-        """Keyword search using FTS5"""
+        """
+        Keyword search using FTS5 + LIKE fallback
+        
+        Strategy:
+        1. Try FTS5 search first (good for English and word-based languages)
+        2. If no results and query contains CJK characters, use LIKE search
+        """
        if scopes is None:
            scopes = ["shared"]
            if user_id:
                scopes.append("user")
        
-        # Build FTS query
+        # Try FTS5 search first
+        fts_results = self._search_fts5(query, user_id, scopes, limit)
+        if fts_results:
+            return fts_results
+        
+        # Fallback to LIKE search for CJK characters
+        if MemoryStorage._contains_cjk(query):
+            return self._search_like(query, user_id, scopes, limit)
+        
+        return []
+    
+    def _search_fts5(
+        self,
+        query: str,
+        user_id: Optional[str],
+        scopes: List[str],
+        limit: int
+    ) -> List[SearchResult]:
+        """FTS5 full-text search"""
        fts_query = self._build_fts_query(query)
        if not fts_query:
            return []
@@ -299,20 +359,83 @@ class MemoryStorage:
            """
            params.append(limit)
        
-        rows = self.conn.execute(sql_query, params).fetchall()
+        try:
+            rows = self.conn.execute(sql_query, params).fetchall()
+            return [
+                SearchResult(
+                    path=row['path'],
+                    start_line=row['start_line'],
+                    end_line=row['end_line'],
+                    score=self._bm25_rank_to_score(row['rank']),
+                    snippet=self._truncate_text(row['text'], 500),
+                    source=row['source'],
+                    user_id=row['user_id']
+                )
+                for row in rows
+            ]
+        except Exception:
+            return []
+    
+    def _search_like(
+        self,
+        query: str,
+        user_id: Optional[str],
+        scopes: List[str],
+        limit: int
+    ) -> List[SearchResult]:
+        """Substring (LIKE) search over chunks for runs of 2+ Han characters (U+4E00-U+9FFF) taken from the query"""
+        import re
+        # Extract runs of 2+ consecutive Han characters; a lone character yields no search term
+        cjk_words = re.findall(r'[\u4e00-\u9fff]{2,}', query)
+        if not cjk_words:
+            return []
        
-        return [
-            SearchResult(
-                path=row['path'],
-                start_line=row['start_line'],
-                end_line=row['end_line'],
-                score=self._bm25_rank_to_score(row['rank']),
-                snippet=self._truncate_text(row['text'], 500),
-                source=row['source'],
-                user_id=row['user_id']
-            )
-            for row in rows
-        ]
+        scope_placeholders = ','.join('?' * len(scopes))
+        
+        # One "text LIKE ?" condition per extracted run, OR-ed together so any run may match
+        like_conditions = []
+        params = []
+        for word in cjk_words:
+            like_conditions.append("text LIKE ?")
+            params.append(f'%{word}%')
+        
+        where_clause = ' OR '.join(like_conditions)
+        params.extend(scopes)  # Parameter order must follow the placeholders: LIKE patterns, scopes, [user_id], limit
+        
+        if user_id:
+            sql_query = f"""
+                SELECT * FROM chunks
+                WHERE ({where_clause})
+                AND scope IN ({scope_placeholders})
+                AND (scope = 'shared' OR user_id = ?)
+                LIMIT ?
+            """
+            params.extend([user_id, limit])
+        else:
+            sql_query = f"""
+                SELECT * FROM chunks
+                WHERE ({where_clause})
+                AND scope IN ({scope_placeholders})
+                LIMIT ?
+            """
+            params.append(limit)
+        
+        try:
+            rows = self.conn.execute(sql_query, params).fetchall()
+            return [
+                SearchResult(
+                    path=row['path'],
+                    start_line=row['start_line'],
+                    end_line=row['end_line'],
+                    score=0.5,  # Fixed score: the query has no ORDER BY, so LIKE hits carry no relevance ranking
+                    snippet=self._truncate_text(row['text'], 500),
+                    source=row['source'],
+                    user_id=row['user_id']
+                )
+                for row in rows
+            ]
+        except Exception:  # Best-effort: any failure is reported to the caller as "no results"
+            return []
    
    def delete_by_path(self, path: str):
        """Delete all chunks from a file"""
@@ -354,7 +477,19 @@ class MemoryStorage:
    def close(self):
        """Close database connection"""
        if self.conn:
-            self.conn.close()
+            try:
+                self.conn.commit()  # Ensure all changes are committed
+                self.conn.close()  # NOTE(review): not reached if commit() raises; self.conn then stays set
+                self.conn = None  # Mark as closed so a repeated close() is a no-op
+            except Exception as e:
+                print(f"⚠️  Error closing database connection: {e}")
+    
+    def __del__(self):
+        """Best-effort finalizer: release the connection via close() and never raise"""
+        try:
+            self.close()
+        except Exception:
+            pass  # Cleanup only; Exception (not a bare except) lets KeyboardInterrupt/SystemExit through
    
    # Helper methods
    
@@ -390,14 +525,29 @@ class MemoryStorage:
        return dot_product / (norm1 * norm2)
    
    @staticmethod
-    def _build_fts_query(raw_query: str) -> Optional[str]:
-        """Build FTS5 query from raw text"""
+    def _contains_cjk(text: str) -> bool:
+        """Check if text contains a Han character (U+4E00-U+9FFF only; kana and Hangul are not matched)"""
        import re
-        tokens = re.findall(r'[A-Za-z0-9_\u4e00-\u9fff]+', raw_query)
+        return bool(re.search(r'[\u4e00-\u9fff]', text))
+    
+    @staticmethod
+    def _build_fts_query(raw_query: str) -> Optional[str]:
+        """
+        Build FTS5 query from raw text
+        
+        Works best for English and word-based languages.
+        For CJK characters, LIKE search will be used as fallback.
+        """
+        import re
+        # Extract words (primarily English words and numbers)
+        tokens = re.findall(r'[A-Za-z0-9_]+', raw_query)
        if not tokens:
            return None
+        
+        # Quote tokens for exact matching
        quoted = [f'"{t}"' for t in tokens]
-        return ' AND '.join(quoted)
+        # Use OR for more flexible matching
+        return ' OR '.join(quoted)
    
    @staticmethod
    def _bm25_rank_to_score(rank: float) -> float:
--- a/agent/memory/tools/init.py
+++ b/agent/memory/tools/init.py
@@ -1,10 +0,0 @@
-"""
-Memory tools for AgentMesh
-
-Provides memory_search and memory_get tools for agents
-"""
-
-from agent.memory.tools.memory_search import MemorySearchTool
-from agent.memory.tools.memory_get import MemoryGetTool
-
-__all__ = ['MemorySearchTool', 'MemoryGetTool']
--- a/agent/memory/tools/memory_get.py
+++ b/agent/memory/tools/memory_get.py
@@ -1,118 +0,0 @@
-"""
-Memory get tool
-
-Allows agents to read specific sections from memory files
-"""
-
-from typing import Dict, Any, Optional
-from pathlib import Path
-from agent.tools.base_tool import BaseTool
-from agent.memory.manager import MemoryManager
-
-
-class MemoryGetTool(BaseTool):
-    """Tool for reading memory file contents"""
-    
-    def __init__(self, memory_manager: MemoryManager):
-        """
-        Initialize memory get tool
-        
-        Args:
-            memory_manager: MemoryManager instance
-        """
-        super().__init__()
-        self.memory_manager = memory_manager
-        self._name = "memory_get"
-        self._description = (
-            "Read specific memory file content by path and line range. "
-            "Use after memory_search to get full context from historical memory files."
-        )
-    
-    @property
-    def name(self) -> str:
-        return self._name
-    
-    @property
-    def description(self) -> str:
-        return self._description
-    
-    @property
-    def parameters(self) -> Dict[str, Any]:
-        return {
-            "type": "object",
-            "properties": {
-                "path": {
-                    "type": "string",
-                    "description": "Relative path to the memory file (e.g., 'MEMORY.md', 'memory/2024-01-29.md')"
-                },
-                "start_line": {
-                    "type": "integer",
-                    "description": "Starting line number (optional, default: 1)",
-                    "default": 1
-                },
-                "num_lines": {
-                    "type": "integer",
-                    "description": "Number of lines to read (optional, reads all if not specified)"
-                }
-            },
-            "required": ["path"]
-        }
-    
-    async def execute(self, **kwargs) -> str:
-        """
-        Execute memory file read
-        
-        Args:
-            path: File path
-            start_line: Start line
-            num_lines: Number of lines
-            
-        Returns:
-            File content
-        """
-        path = kwargs.get("path")
-        start_line = kwargs.get("start_line", 1)
-        num_lines = kwargs.get("num_lines")
-        
-        if not path:
-            return "Error: path parameter is required"
-        
-        try:
-            workspace_dir = self.memory_manager.config.get_workspace()
-            file_path = workspace_dir / path
-            
-            if not file_path.exists():
-                return f"Error: File not found: {path}"
-            
-            content = file_path.read_text()
-            lines = content.split('\n')
-            
-            # Handle line range
-            if start_line < 1:
-                start_line = 1
-            
-            start_idx = start_line - 1
-            
-            if num_lines:
-                end_idx = start_idx + num_lines
-                selected_lines = lines[start_idx:end_idx]
-            else:
-                selected_lines = lines[start_idx:]
-            
-            result = '\n'.join(selected_lines)
-            
-            # Add metadata
-            total_lines = len(lines)
-            shown_lines = len(selected_lines)
-            
-            output = [
-                f"File: {path}",
-                f"Lines: {start_line}-{start_line + shown_lines - 1} (total: {total_lines})",
-                "",
-                result
-            ]
-            
-            return '\n'.join(output)
-            
-        except Exception as e:
-            return f"Error reading memory file: {str(e)}"
--- a/agent/memory/tools/memory_search.py
+++ b/agent/memory/tools/memory_search.py
@@ -1,106 +0,0 @@
-"""
-Memory search tool
-
-Allows agents to search their memory using semantic and keyword search
-"""
-
-from typing import Dict, Any, Optional
-from agent.tools.base_tool import BaseTool
-from agent.memory.manager import MemoryManager
-
-
-class MemorySearchTool(BaseTool):
-    """Tool for searching agent memory"""
-    
-    def __init__(self, memory_manager: MemoryManager, user_id: Optional[str] = None):
-        """
-        Initialize memory search tool
-        
-        Args:
-            memory_manager: MemoryManager instance
-            user_id: Optional user ID for scoped search
-        """
-        super().__init__()
-        self.memory_manager = memory_manager
-        self.user_id = user_id
-        self._name = "memory_search"
-        self._description = (
-            "Search historical memory files (beyond today/yesterday) using semantic and keyword search. "
-            "Recent context (MEMORY.md + today + yesterday) is already loaded. "
-            "Use this ONLY for older dates, specific past events, or when current context lacks needed info."
-        )
-    
-    @property
-    def name(self) -> str:
-        return self._name
-    
-    @property
-    def description(self) -> str:
-        return self._description
-    
-    @property
-    def parameters(self) -> Dict[str, Any]:
-        return {
-            "type": "object",
-            "properties": {
-                "query": {
-                    "type": "string",
-                    "description": "Search query (can be natural language question or keywords)"
-                },
-                "max_results": {
-                    "type": "integer",
-                    "description": "Maximum number of results to return (default: 10)",
-                    "default": 10
-                },
-                "min_score": {
-                    "type": "number",
-                    "description": "Minimum relevance score (0-1, default: 0.3)",
-                    "default": 0.3
-                }
-            },
-            "required": ["query"]
-        }
-    
-    async def execute(self, **kwargs) -> str:
-        """
-        Execute memory search
-        
-        Args:
-            query: Search query
-            max_results: Maximum results
-            min_score: Minimum score
-            
-        Returns:
-            Formatted search results
-        """
-        query = kwargs.get("query")
-        max_results = kwargs.get("max_results", 10)
-        min_score = kwargs.get("min_score", 0.3)
-        
-        if not query:
-            return "Error: query parameter is required"
-        
-        try:
-            results = await self.memory_manager.search(
-                query=query,
-                user_id=self.user_id,
-                max_results=max_results,
-                min_score=min_score,
-                include_shared=True
-            )
-            
-            if not results:
-                return f"No relevant memories found for query: {query}"
-            
-            # Format results
-            output = [f"Found {len(results)} relevant memories:\n"]
-            
-            for i, result in enumerate(results, 1):
-                output.append(f"\n{i}. {result.path} (lines {result.start_line}-{result.end_line})")
-                output.append(f"   Score: {result.score:.3f}")
-                output.append(f"   Snippet: {result.snippet}")
-            
-            return "\n".join(output)
-            
-        except Exception as e:
-            return f"Error searching memory: {str(e)}"
--- a/agent/tools/edit/edit.py
+++ b/agent/tools/edit/edit.py
@@ -46,6 +46,7 @@ class Edit(BaseTool):
    def __init__(self, config: dict = None):
        self.config = config or {}
        self.cwd = self.config.get("cwd", os.getcwd())
+        self.memory_manager = self.config.get("memory_manager", None)
    
    def execute(self, args: Dict[str, Any]) -> ToolResult:
        """
@@ -141,6 +142,14 @@ class Edit(BaseTool):
                "first_changed_line": diff_result['first_changed_line']
            }
            
+            # Notify memory manager if file is in memory directory
+            if self.memory_manager and "memory/" in path:
+                try:
+                    self.memory_manager.mark_dirty()
+                except Exception as e:
+                    # Don't fail the edit if memory notification fails
+                    pass
+            
            return ToolResult.success(result)
            
        except UnicodeDecodeError:
--- a/agent/tools/memory/memory_get.py
+++ b/agent/tools/memory/memory_get.py
@@ -22,7 +22,7 @@ class MemoryGetTool(BaseTool):
        "properties": {
            "path": {
                "type": "string",
-                "description": "Relative path to the memory file (e.g., 'MEMORY.md', 'memory/2024-01-29.md')"
+                "description": "Relative path to the memory file (e.g., 'memory/MEMORY.md', 'memory/2024-01-29.md')"
            },
            "start_line": {
                "type": "integer",
@@ -68,6 +68,11 @@ class MemoryGetTool(BaseTool):
        
        try:
            workspace_dir = self.memory_manager.config.get_workspace()
+            
+            # Auto-prepend memory/ if not present and not absolute path
+            if not path.startswith('memory/') and not path.startswith('/'):
+                path = f'memory/{path}'
+            
            file_path = workspace_dir / path
            
            if not file_path.exists():
--- a/agent/tools/memory/memory_search.py
+++ b/agent/tools/memory/memory_search.py
@@ -30,8 +30,8 @@ class MemorySearchTool(BaseTool):
            },
            "min_score": {
                "type": "number",
-                "description": "Minimum relevance score (0-1, default: 0.3)",
-                "default": 0.3
+                "description": "Minimum relevance score (0-1, default: 0.1)",
+                "default": 0.1
            }
        },
        "required": ["query"]
@@ -64,7 +64,7 @@ class MemorySearchTool(BaseTool):
        
        query = args.get("query")
        max_results = args.get("max_results", 10)
-        min_score = args.get("min_score", 0.3)
+        min_score = args.get("min_score", 0.1)
        
        if not query:
            return ToolResult.fail("Error: query parameter is required")
--- a/agent/tools/write/write.py
+++ b/agent/tools/write/write.py
@@ -34,6 +34,7 @@ class Write(BaseTool):
    def __init__(self, config: dict = None):
        self.config = config or {}
        self.cwd = self.config.get("cwd", os.getcwd())
+        self.memory_manager = self.config.get("memory_manager", None)
    
    def execute(self, args: Dict[str, Any]) -> ToolResult:
        """
@@ -64,6 +65,10 @@ class Write(BaseTool):
            # Get bytes written
            bytes_written = len(content.encode('utf-8'))
            
+            # Auto-sync to memory database if this is a memory file
+            if self.memory_manager and 'memory/' in path:
+                self.memory_manager.mark_dirty()
+            
            result = {
                "message": f"Successfully wrote {bytes_written} bytes to {path}",
                "path": path,
--- a/memory/2026-01-29.md
+++ b/memory/2026-01-29.md
@@ -1,5 +0,0 @@
-# 2026-01-29 记录
-
-## 老王的重要决定
- 今天老王告诉我他决定要学AI了，这是一个重要的决策
- 这可能会是他学习和职业发展的一个转折点
--- a/memory/MEMORY.md
+++ b/memory/MEMORY.md
@@ -1,21 +0,0 @@
-# Memory
-
-Long-term curated memories and preferences.
-
-## 用户信息
- 用户名：老王
-
-## 用户信息
- 用户名：老王
-
-## 用户偏好
- 喜欢吃红烧肉
- 爱打篮球
-
-## 重要决策
- 决定要学习AI（2026-01-29）
-
-## Notes
-
- Important decisions and facts go here
- This is your long-term knowledge base