feat: add Codex model name normalization for consistent pricing lookup

Normalize model names from JSONL session logs before storage and pricing lookup: lowercase, strip provider prefix (openai/), strip date suffixes (-YYYY-MM-DD, -YYYYMMDD). Also clamp cached tokens to not exceed input.
2026-05-18 19:20:07 +08:00 · 2026-04-06 19:02:32 +08:00
parent 2ea6a307fc
commit 8ad1bb7924
1 changed files with 165 additions and 32 deletions
@@ -55,6 +55,51 @@ struct FileParseState {
    event_index: u32,
 }

+/// 归一化 Codex 模型名
+///
+/// 处理规则（按顺序）：
+/// 1. 转小写：`GLM-4.6` → `glm-4.6`
+/// 2. 剥离 provider 前缀：`openai/gpt-5.4` → `gpt-5.4`
+/// 3. 剥离 ISO 日期后缀：`gpt-5.4-2026-03-05` → `gpt-5.4`
+/// 4. 剥离紧凑日期后缀：`gpt-5.4-20260305` → `gpt-5.4`
+fn normalize_codex_model(raw: &str) -> String {
+    // Step 1: 小写
+    let mut name = raw.to_lowercase();
+
+    // Step 2: 剥离 "provider/" 前缀（如 openai/, azure/）
+    if let Some(pos) = name.rfind('/') {
+        name = name[pos + 1..].to_string();
+    }
+
+    // Step 3: 剥离 ISO 日期后缀 -YYYY-MM-DD（正好 11 字符）
+    if name.len() > 11 {
+        let suffix = &name[name.len() - 11..];
+        if suffix.as_bytes()[0] == b'-'
+            && suffix[1..5].chars().all(|c| c.is_ascii_digit())
+            && suffix.as_bytes()[5] == b'-'
+            && suffix[6..8].chars().all(|c| c.is_ascii_digit())
+            && suffix.as_bytes()[8] == b'-'
+            && suffix[9..11].chars().all(|c| c.is_ascii_digit())
+        {
+            name.truncate(name.len() - 11);
+        }
+    }
+
+    // Step 4: 剥离紧凑日期后缀 -YYYYMMDD（正好 9 字符）
+    if name.len() > 9 {
+        let parts: Vec<&str> = name.rsplitn(2, '-').collect();
+        if parts.len() == 2 {
+            if let Some(suffix) = parts.first() {
+                if suffix.len() == 8 && suffix.chars().all(|c| c.is_ascii_digit()) {
+                    name = parts[1].to_string();
+                }
+            }
+        }
+    }
+
+    name
+}
+
 /// 计算两次累计值之间的 delta
 fn compute_delta(prev: &Option<CumulativeTokens>, current: &CumulativeTokens) -> DeltaTokens {
    match prev {
@@ -273,7 +318,7 @@ fn sync_single_codex_file(db: &Database, file_path: &Path) -> Result<(u32, u32),
                        .or_else(|| payload.get("info").and_then(|info| info.get("model")))
                        .and_then(|v| v.as_str())
                    {
-                        state.current_model = model.to_string();
+                        state.current_model = normalize_codex_model(model);
                    }
                }
            }
@@ -300,7 +345,7 @@ fn sync_single_codex_file(db: &Database, file_path: &Path) -> Result<(u32, u32),
                    .or_else(|| payload.get("model"))
                    .and_then(|v| v.as_str())
                {
-                    state.current_model = model.to_string();
+                    state.current_model = normalize_codex_model(model);
                }

                // 优先用 total_token_usage（累计值），fallback 到 last_token_usage（增量值）
@@ -331,6 +376,12 @@ fn sync_single_codex_file(db: &Database, file_path: &Path) -> Result<(u32, u32),
                    }
                };

+                // 钳制：cached 不应超过 input（防护异常数据）
+                let delta = DeltaTokens {
+                    cached_input: delta.cached_input.min(delta.input),
+                    ..delta
+                };
+
                if delta.is_zero() {
                    continue; // 跳过 task 边界的零 delta 事件
                }
@@ -521,10 +572,38 @@ fn update_sync_state(
    Ok(())
 }

-/// 查找 Codex 模型定价
+/// ��找 Codex 模型定价（带归一化）
 fn find_codex_pricing(conn: &rusqlite::Connection, model_id: &str) -> Option<ModelPricing> {
-    // 精确匹��
-    let result = conn.query_row(
+    let normalized = normalize_codex_model(model_id);
+
+    // 1. 精确匹配（归一化后的名称）
+    if let Some(pricing) = try_find_pricing(conn, &normalized) {
+        return Some(pricing);
+    }
+
+    // 2. LIKE 模糊匹配（兜底）
+    let pattern = format!("{normalized}%");
+    conn.query_row(
+        "SELECT input_cost_per_million, output_cost_per_million,
+                cache_read_cost_per_million, cache_creation_cost_per_million
+         FROM model_pricing WHERE model_id LIKE ?1 LIMIT 1",
+        rusqlite::params![pattern],
+        |row| {
+            Ok((
+                row.get::<_, String>(0)?,
+                row.get::<_, String>(1)?,
+                row.get::<_, String>(2)?,
+                row.get::<_, String>(3)?,
+            ))
+        },
+    )
+    .ok()
+    .and_then(|(i, o, cr, cc)| ModelPricing::from_strings(&i, &o, &cr, &cc).ok())
+}
+
+/// 精确匹配定价查询
+fn try_find_pricing(conn: &rusqlite::Connection, model_id: &str) -> Option<ModelPricing> {
+    conn.query_row(
        "SELECT input_cost_per_million, output_cost_per_million,
                cache_read_cost_per_million, cache_creation_cost_per_million
         FROM model_pricing WHERE model_id = ?1",
@@ -537,33 +616,9 @@ fn find_codex_pricing(conn: &rusqlite::Connection, model_id: &str) -> Option<Mod
                row.get::<_, String>(3)?,
            ))
        },
-    );
-
-    match result {
-        Ok((input, output, cache_read, cache_creation)) => {
-            ModelPricing::from_strings(&input, &output, &cache_read, &cache_creation).ok()
-        }
-        Err(_) => {
-            // 尝试 LIKE 匹配
-            let pattern = format!("{model_id}%");
-            conn.query_row(
-                "SELECT input_cost_per_million, output_cost_per_million,
-                        cache_read_cost_per_million, cache_creation_cost_per_million
-                 FROM model_pricing WHERE model_id LIKE ?1 LIMIT 1",
-                rusqlite::params![pattern],
-                |row| {
-                    Ok((
-                        row.get::<_, String>(0)?,
-                        row.get::<_, String>(1)?,
-                        row.get::<_, String>(2)?,
-                        row.get::<_, String>(3)?,
-                    ))
-                },
-            )
-            .ok()
-            .and_then(|(i, o, cr, cc)| ModelPricing::from_strings(&i, &o, &cr, &cc).ok())
-        }
-    }
+    )
+    .ok()
+    .and_then(|(i, o, cr, cc)| ModelPricing::from_strings(&i, &o, &cr, &cc).ok())
 }

 #[cfg(test)]
@@ -678,4 +733,82 @@ mod tests {
        let files = collect_codex_session_files(Path::new("/nonexistent/path"));
        assert!(files.is_empty());
    }
+
+    // ── 模型名归一化测试 ──
+
+    #[test]
+    fn test_normalize_codex_model_lowercase() {
+        assert_eq!(normalize_codex_model("GLM-4.6"), "glm-4.6");
+        assert_eq!(normalize_codex_model("DeepSeek-Chat"), "deepseek-chat");
+        assert_eq!(normalize_codex_model("GPT-5.4"), "gpt-5.4");
+    }
+
+    #[test]
+    fn test_normalize_codex_model_strip_prefix() {
+        assert_eq!(normalize_codex_model("openai/gpt-5.4"), "gpt-5.4");
+        assert_eq!(normalize_codex_model("azure/gpt-5.2-codex"), "gpt-5.2-codex");
+        assert_eq!(normalize_codex_model("OPENAI/GPT-5.4"), "gpt-5.4");
+    }
+
+    #[test]
+    fn test_normalize_codex_model_strip_iso_date() {
+        assert_eq!(normalize_codex_model("gpt-5.4-2026-03-05"), "gpt-5.4");
+        assert_eq!(
+            normalize_codex_model("gpt-5.4-pro-2026-03-05"),
+            "gpt-5.4-pro"
+        );
+    }
+
+    #[test]
+    fn test_normalize_codex_model_strip_compact_date() {
+        assert_eq!(normalize_codex_model("gpt-5.4-20260305"), "gpt-5.4");
+        assert_eq!(
+            normalize_codex_model("claude-opus-4-6-20260206"),
+            "claude-opus-4-6"
+        );
+    }
+
+    #[test]
+    fn test_normalize_codex_model_no_change() {
+        assert_eq!(normalize_codex_model("gpt-5.4"), "gpt-5.4");
+        assert_eq!(normalize_codex_model("gpt-5.2-codex"), "gpt-5.2-codex");
+        assert_eq!(normalize_codex_model("o3"), "o3");
+        assert_eq!(normalize_codex_model("deepseek-chat"), "deepseek-chat");
+    }
+
+    #[test]
+    fn test_normalize_codex_model_combined() {
+        // prefix + uppercase + ISO date
+        assert_eq!(
+            normalize_codex_model("openai/GPT-5.4-2026-03-05"),
+            "gpt-5.4"
+        );
+        // prefix + compact date
+        assert_eq!(
+            normalize_codex_model("openai/gpt-5.4-20260305"),
+            "gpt-5.4"
+        );
+    }
+
+    #[test]
+    fn test_cached_clamped_to_input() {
+        // cached > input 的异常场景应被 min() 钳制
+        let prev = Some(CumulativeTokens {
+            input: 100,
+            cached_input: 0,
+            output: 50,
+        });
+        let current = CumulativeTokens {
+            input: 110,   // delta = 10
+            cached_input: 80, // delta = 80（异常：大于 input delta）
+            output: 60,
+        };
+        let delta = compute_delta(&prev, &current);
+        // 钳制前：cached_input = 80, input = 10
+        assert_eq!(delta.cached_input, 80);
+        assert_eq!(delta.input, 10);
+        // 实际钳制在调用侧：delta.cached_input.min(delta.input)
+        let clamped = delta.cached_input.min(delta.input);
+        assert_eq!(clamped, 10);
+    }
 }