cultivation-world-simulator/tools/i18n/check_po_duplicates.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""检查 po 文件中是否有重复的 msgid"""

import re
import sys
from pathlib import Path
from collections import Counter


def extract_msgids(filepath: Path) -> list[str]:
    """
    从 po 文件中提取所有 msgid

    Args:
        filepath: po 文件路径

    Returns:
        msgid 列表（不包含空字符串）
    """
    with open(filepath, 'r', encoding='utf-8') as f:
        content = f.read()

    # 匹配 msgid "..." 模式
    pattern = r'msgid\s+"([^"]*)"'
    matches = re.findall(pattern, content)

    # 过滤掉空字符串（文件头的 msgid ""）
    msgids = [m for m in matches if m]

    return msgids


def find_duplicates(msgids: list[str]) -> dict[str, int]:
    """
    找出重复的 msgid

    Args:
        msgids: msgid 列表

    Returns:
        字典，键为重复的 msgid，值为出现次数
    """
    counter = Counter(msgids)
    duplicates = {msgid: count for msgid, count in counter.items() if count > 1}
    return duplicates


def check_file(filepath: Path, lang_name: str) -> tuple[int, dict[str, int]]:
    """
    检查单个 po 文件

    Args:
        filepath: po 文件路径
        lang_name: 语言名称（用于显示）

    Returns:
        (msgid总数, 重复项字典)
    """
    print(f"\n{'='*60}")
    print(f"检查文件: {lang_name}")
    print(f"路径: {filepath}")
    print(f"{'='*60}")

    if not filepath.exists():
        print(f"[ERROR] 文件不存在")
        return 0, {}

    msgids = extract_msgids(filepath)
    print(f"总共找到 {len(msgids)} 个 msgid 条目")

    duplicates = find_duplicates(msgids)

    if duplicates:
        print(f"\n[WARNING] 发现 {len(duplicates)} 个重复的 msgid:")
        for msgid, count in sorted(duplicates.items()):
            print(f"  - '{msgid}' 出现了 {count} 次")
    else:
        print(f"\n[OK] 未发现重复的 msgid")

    return len(msgids), duplicates


def main():
    """主函数"""
    # 获取项目根目录
    script_dir = Path(__file__).parent
    project_root = script_dir.parent.parent

    # po 文件路径
    zh_file = project_root / "static" / "locales" / "zh-CN" / "LC_MESSAGES" / "messages.po"
    en_file = project_root / "static" / "locales" / "en-US" / "LC_MESSAGES" / "messages.po"

    # 检查中文文件
    zh_count, zh_dups = check_file(zh_file, "中文 (zh_CN)")

    # 检查英文文件
    en_count, en_dups = check_file(en_file, "英文 (en_US)")

    # 打印总结
    print(f"\n{'='*60}")
    print("检查总结")
    print(f"{'='*60}")

    has_error = False

    if zh_dups or en_dups:
        print("[ERROR] 发现重复条目，需要修复")
        has_error = True
    else:
        print("[OK] 两个文件都没有重复的 msgid")

    if zh_count != en_count:
        print(f"[WARNING] 中英文 msgid 数量不一致: 中文 {zh_count} 个, 英文 {en_count} 个")
        has_error = True
    else:
        print(f"[OK] 中英文 msgid 数量一致: {zh_count} 个")

    # 检查 msgid 键是否匹配
    if zh_count > 0 and en_count > 0:
        zh_msgids = set(extract_msgids(zh_file))
        en_msgids = set(extract_msgids(en_file))

        zh_only = zh_msgids - en_msgids
        en_only = en_msgids - zh_msgids

        if zh_only:
            print(f"\n[WARNING] 只在中文中存在的 msgid ({len(zh_only)} 个):")
            for msgid in sorted(zh_only)[:5]:
                print(f"  - '{msgid}'")
            if len(zh_only) > 5:
                print(f"  ... 还有 {len(zh_only) - 5} 个")
            has_error = True

        if en_only:
            print(f"\n[WARNING] 只在英文中存在的 msgid ({len(en_only)} 个):")
            for msgid in sorted(en_only)[:5]:
                print(f"  - '{msgid}'")
            if len(en_only) > 5:
                print(f"  ... 还有 {len(en_only) - 5} 个")
            has_error = True

        if not zh_only and not en_only:
            print("[OK] 中英文 msgid 键完全匹配")

    # 返回状态码
    return 1 if has_error else 0


if __name__ == "__main__":
    sys.exit(main())