Files
WeChatMsg_fix/generate_report_data.py
2025-12-26 15:59:00 +08:00

429 lines
19 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import os
import sys
import time
import re
import jieba
import collections
from collections import Counter, defaultdict
import datetime
import urllib.request
import ssl
import html
# Add project root to path
project_root = os.path.dirname(os.path.abspath(__file__))
sys.path.insert(0, project_root)
from wxManager import DatabaseConnection, MessageType
from wxManager.model import Me
from wxManager.parser.link_parser import wx_sport
# WeChat Sport official account; its link messages carry the daily step count.
_WX_SPORT_ACCOUNT = 'gh_43f2581f6fd6'

# Matches at least one CJK unified ideograph in the U+4E00..U+9FA5 range.
_HAN_CHAR_RE = re.compile(r'[\u4e00-\u9fa5]')

# Tokens never reported as keywords.
# NOTE(review): the repeated '' entries are reproduced exactly as found; they
# look like characters that were lost in transit -- confirm against the
# original list. They are harmless because tokens shorter than two characters
# are rejected before the stop-word lookup.
_STOP_WORDS = frozenset({
    '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '一个', '这个', '什么', '怎么', '可以', '知道', '现在', '今天', '就是', '还是', '没有', '不是', '但是', '因为', '所以', '如果', '那个', '觉得', '其实', '应该', '可能', '然后', '时候', '感觉', '一下', '一点', '真的', '已经', '只是', '出来', '起来', '看着', '看到', '自己', '我们', '你们', '他们', '图片', '表情', '收到', '链接', '视频', '语音', 'https', 'http', 'com', 'cn', 'www', '美团', '红包', 'net', 'org', 'html', 'htm',
    '或者', '还有', '微信', '没事', '直接', '明天', '数据', '消息', '东西', '里面', '问号', '朋友', '人家', '之前', '哪个', '开始', '问题', '感情', '晚上', '意思', '学校', '手机', '不能', '一会', '这种', '宝宝', '不行', '我草', '谢谢', '多少', '不到', '的话', '别人', '我要', '左右', '组家', '警这种关系左石东西号1号0号0老狐函我草HШ李睢', '_别乐w月', '或者192', '捂脸', '笑哭', '呲牙', '偷笑', '调皮', '阴险', '', '', '骷髅', '敲打', '再见', '擦汗', '抠鼻', '鼓掌', '糗大了', '坏笑', '左哼哼', '右哼哼', '哈欠', '鄙视', '委屈', '快哭了', '亲亲', '', '可怜', '菜刀', '西瓜', '啤酒', '篮球', '乒乓', '咖啡', '', '猪头', '玫瑰', '凋谢', '示爱', '爱心', '心碎', '蛋糕', '闪电', '炸弹', '', '足球', '瓢虫', '便便', '月亮', '太阳', '礼物', '拥抱', '', '', '握手', '胜利', '抱拳', '勾引', '拳头', '差劲', '爱你', 'NO', 'OK', '爱情', '飞吻', '跳跳', '发抖', '怄火', '转圈', '磕头', '回头', '跳绳', '挥手', '激动', '街舞', '献吻', '左太极', '右太极'
})


def _js_string(text, quote='"'):
    """Return ``text`` as a quoted, escaped JavaScript string literal.

    ``quote`` is the delimiter to use (``"`` or ``'``). Text without special
    characters comes out exactly as plain ``quote + text + quote``.
    """
    import json  # function-scope import, like the sqlite3 import in this file

    # json.dumps escapes backslashes, double quotes and control characters.
    escaped = json.dumps(str(text), ensure_ascii=False)[1:-1]
    if quote == "'":
        # Inside single quotes, '"' needs no escape but "'" does.
        escaped = escaped.replace('\\"', '"').replace("'", "\\'")
    return f"{quote}{escaped}{quote}"


def _sub_literal(pattern, replacement, text):
    """Replace every match of ``pattern`` in ``text`` with ``replacement`` verbatim.

    A callable is passed to ``re.sub`` so that backslashes in ``replacement``
    are not interpreted as template escapes or group references.
    """
    return re.sub(pattern, lambda _match: replacement, text)


def _is_keyword(word, stop_words):
    """Return True if a segmented token may be counted as a keyword."""
    if len(word) <= 1 or word in stop_words:
        return False
    if any(ch.isdigit() for ch in word):
        return False
    # Must contain a Chinese character, or be purely alphabetic and longer
    # than two characters; this drops fragments such as "HШ" and symbol runs.
    return bool(_HAN_CHAR_RE.search(word)) or (word.isalpha() and len(word) > 2)


def _count_keywords(texts, stop_words):
    """Count, for each keyword, the number of messages in which it appears."""
    counter = Counter()
    for text in texts:
        # A set, so a word repeated inside one message is counted once.
        counter.update({w for w in jieba.cut(text) if _is_keyword(w, stop_words)})
    return counter


def _lookup_emoji_url(db_dir, md5):
    """Look up a sticker URL by md5 in the local emoticon database, if present."""
    import sqlite3

    emoticon_db_path = os.path.join(db_dir, 'emoticon', 'emoticon.db')
    if not os.path.exists(emoticon_db_path):
        return None
    conn_emo = sqlite3.connect(emoticon_db_path)
    try:
        row = conn_emo.execute(
            "select cdn_url, thumb_url from kNonStoreEmoticonTable where md5=?",
            (md5,),
        ).fetchone()
    finally:
        # Close the connection even when the query fails.
        conn_emo.close()
    return (row[0] or row[1]) if row else None


def generate_report_data(
    db_dir=r'e:\WeChatMsg\wxid_g4pshorcc0r529\db_storage',
    report_root=r"e:\WeChatMsg\AnnualReport\report-2025\single",
    db_version=4,
):
    """Generate the 2025 annual-report data and patch it into the front end.

    Reads private-chat messages and WeChat Sport step messages from the
    database, aggregates statistics for the year 2025, saves avatars and the
    most used sticker under ``public/header``, and rewrites the exported
    data objects in ``src/js/getdata.js`` in place.

    Args:
        db_dir: Directory holding the database files and ``info.json``.
        report_root: Root directory of the report front-end project.
        db_version: Database version passed to ``DatabaseConnection``.
    """
    print("开始生成个性化年度报告数据...")

    # 1. Set up the database.
    conn = DatabaseConnection(db_dir, db_version)
    db = conn.get_interface()

    # Load the account owner's profile.
    Me().load_from_json(os.path.join(db_dir, 'info.json'))
    self_wxid = Me().wxid
    self_name = Me().name
    print(f"当前用户: {self_name} ({self_wxid})")

    # 2. Set up output paths.
    js_file = os.path.join(report_root, "src", "js", "getdata.js")
    avatar_dir = os.path.join(report_root, "public", "header")
    os.makedirs(avatar_dir, exist_ok=True)

    # 3. Helper: save an avatar and return its quoted JS path.
    def get_avatar_path(wxid):
        """Write the avatar of ``wxid`` to disk; fall back to a default image."""
        try:
            buf = db.get_avatar_buffer(wxid)
            if buf:
                filename = f"{wxid}.jpg"
                with open(os.path.join(avatar_dir, filename), 'wb') as f:
                    f.write(buf)
                return f"'./header/{filename}'"
        except Exception as e:
            # Best effort: report the failure and use the default avatar.
            print(f"保存头像失败 ({wxid}): {e}")
        return "'./header/header12.webp'"  # Default

    self_avatar_src = get_avatar_path(self_wxid)

    # 4. Helper: resolve a display name.
    contact_cache = {c.wxid: c for c in db.get_contacts()}

    def get_name(wxid):
        """Return remark, else nickname, else the raw wxid."""
        contact = contact_cache.get(wxid)
        if contact is not None:
            for attr in ('remark', 'nickname'):
                value = getattr(contact, attr, None)
                if value:
                    return value
        return wxid

    # 5. Analyze messages.
    print("正在分析消息记录 (仅统计私聊)...")
    total_sent = 0
    total_received = 0
    # NOTE(review): accumulates the length of every text message, sent and
    # received, although one label it feeds reads "发送总字数" -- confirm intent.
    total_words = 0
    hour_counts = [0] * 24
    daily_msg_counts = defaultdict(int)  # '2025-01-01' -> count
    friend_msg_counts = Counter()
    friend_word_counts = Counter()
    friend_monthly_counts = defaultdict(lambda: defaultdict(int))  # '1' -> {wxid: count}
    emoji_counter = Counter()
    emoji_urls = {}
    text_content = []
    daily_step_counts = defaultdict(int)  # '2025-01-01' -> steps
    sport_parse_failures = 0

    # Only messages inside this half-open range are counted.
    start_2025 = datetime.datetime(2025, 1, 1).timestamp()
    end_2025 = datetime.datetime(2026, 1, 1).timestamp()

    sessions = db.session_db.get_session()
    session_users = [s[0] for s in sessions]

    for processed_count, username in enumerate(session_users, start=1):
        if processed_count % 50 == 0:
            print(f"已处理 {processed_count}/{len(session_users)} 个会话...")

        # WeChat Sport is an official account (normally excluded), but it is
        # the only source of step data, so handle it before the filter.
        if username == _WX_SPORT_ACCOUNT:
            for msg in db.get_messages(username) or []:
                ts = msg.timestamp
                if ts <= 0 or not (start_2025 <= ts < end_2025):
                    continue
                if msg.type != MessageType.LinkMessage or not getattr(msg, 'xml_content', None):
                    continue
                try:
                    sport_data = wx_sport(msg.xml_content)
                    if sport_data and sport_data.get('score'):
                        score_str = str(sport_data.get('score', '0')).replace(',', '')
                        steps = int(score_str) if score_str.isdigit() else 0
                        if steps > 0:
                            date_str = datetime.datetime.fromtimestamp(ts).strftime('%Y-%m-%d')
                            # Several records may exist per day; keep the largest.
                            daily_step_counts[date_str] = max(daily_step_counts[date_str], steps)
                except Exception:
                    # Skip unparsable messages but keep a count for the log.
                    sport_parse_failures += 1
            continue

        # Strict filter: private chats only. Excludes chatrooms, official
        # accounts, the file helper, enterprise contacts and one specific id.
        if (
            username.endswith(('@chatroom', '@openim', '@qy_u'))
            or username.startswith('gh_')
            or username in ('filehelper', 'jQ4jTweaBCAFtdK')
        ):
            continue

        msgs = db.get_messages(username)
        if not msgs:
            continue

        for msg in msgs:
            ts = msg.timestamp
            # The annual report only covers 2025.
            if ts <= 0 or not (start_2025 <= ts < end_2025):
                continue
            # Convert only in-range timestamps.
            dt = datetime.datetime.fromtimestamp(ts)
            daily_msg_counts[dt.strftime('%Y-%m-%d')] += 1
            hour_counts[dt.hour] += 1
            if msg.is_sender:
                total_sent += 1
            else:
                total_received += 1

            if msg.type == MessageType.Text and msg.content:
                text_len = len(msg.content)
                total_words += text_len
                friend_msg_counts[username] += 1
                friend_word_counts[username] += text_len
                friend_monthly_counts[f"{dt.month}"][username] += 1
                # Keywords come only from messages sent by the owner; capped
                # to bound memory use.
                if msg.is_sender and len(text_content) < 50000:
                    text_content.append(msg.content)
            elif msg.type == 47 and msg.is_sender:
                # Type 47 is treated as a sticker/emoji message here.
                md5 = getattr(msg, 'md5', None)
                if md5:
                    emoji_counter[md5] += 1
                    url = getattr(msg, 'url', None)
                    if url:
                        emoji_urls[md5] = url

    # 6. Process the data.
    print("正在计算统计数据...")
    if sport_parse_failures:
        print(f"有 {sport_parse_failures} 条微信运动消息解析失败,已跳过")
    print(f"读取到 {len(daily_step_counts)} 天的步数数据,总计 {sum(daily_step_counts.values())}")

    # Days elapsed in 2025: up to today while still in 2025, else the full year.
    now = datetime.datetime.now()
    if now.year == 2025:
        days_in_year = (now - datetime.datetime(2025, 1, 1)).days + 1
    elif now.year > 2025:
        days_in_year = 365
    else:
        days_in_year = 1  # Should not happen based on context

    # Top friends by text-message count.
    chat_friends_data = [
        {
            'name': get_name(wxid),
            'messageCount': f"{count}条消息",
            'wordCount': f"{friend_word_counts[wxid]}",
            'avatarSrc': get_avatar_path(wxid),
        }
        for wxid, count in friend_msg_counts.most_common(5)
    ]

    # Most active friend for each month that has data.
    month_friends_data = []
    for month in range(1, 13):
        m_key = f"{month}"
        if m_key in friend_monthly_counts:
            wxid, num = max(friend_monthly_counts[m_key].items(), key=lambda item: item[1])
            month_friends_data.append({
                'month': m_key,
                'nickname': get_name(wxid),
                'className': 'passion',
                'num': num,
                'avatar': get_avatar_path(wxid),
            })

    # Keywords.
    print("正在生成关键词...")
    word_counter = _count_keywords(text_content, _STOP_WORDS)
    keywords_list = [[w, c] for w, c in word_counter.most_common(50)]
    top_keyword = ""
    top_keyword_num = 0
    if keywords_list:
        top_keyword, top_keyword_num = keywords_list[0]

    # Heatmap data from the step counts: [['2025-01-01', 10000], ...]
    heatmap_data_js = (
        "[\n"
        + "".join(f" ['{date_str}', {steps}],\n" for date_str, steps in daily_step_counts.items())
        + " ]"
    )

    # Most used sticker.
    top_emoji_src = "'./header/header48.webp'"  # Default
    if emoji_counter:
        top_md5, top_count = emoji_counter.most_common(1)[0]
        print(f"最常用表情包 MD5: {top_md5} (使用 {top_count} 次)")
        cdn_url = emoji_urls.get(top_md5)
        try:
            if not cdn_url:
                cdn_url = _lookup_emoji_url(db_dir, top_md5)
            if cdn_url:
                cdn_url = html.unescape(cdn_url)
                print(f"下载表情包: {cdn_url}")
                emoji_filename = f"emoji_{top_md5}.jpg"
                emoji_path = os.path.join(avatar_dir, emoji_filename)
                # SECURITY NOTE: certificate verification is disabled here, as
                # in the original code. Prefer ssl.create_default_context() if
                # the CDN presents a valid certificate.
                ssl_context = ssl._create_unverified_context()
                req = urllib.request.Request(cdn_url, headers={'User-Agent': 'Mozilla/5.0'})
                # The timeout keeps a stalled download from hanging the script.
                with urllib.request.urlopen(req, context=ssl_context, timeout=30) as response, \
                        open(emoji_path, 'wb') as out_file:
                    out_file.write(response.read())
                top_emoji_src = f"'./header/{emoji_filename}'"
        except Exception as e:
            print(f"获取表情包失败: {e}")

    # 7. Update the front-end data file.
    print(f"正在更新前端文件: {js_file}")
    with open(js_file, 'r', encoding='utf-8') as f:
        content = f.read()

    # Welcome_data
    welcome_js = "\n".join([
        "export const Welcome_data = reactive({",
        f"avatarSrc: {self_avatar_src},",
        f"nickname: {_js_string(self_name)},",
        "descriptionText: {",
        'hello: "Hello World!",',
        'text1: "时光荏苒,转眼间我们又走过了一年。",',
        'text2: "在过去的365天里从深夜的长谈到清晨的祝福从好友间的调侃到工作中的忙碌……",',
        'text4: "这些聊天记录,是属于你的独家记忆。",',
        'text7: "打开报告,开启你的专属年度记忆吧!"',
        "}",
        "});",
    ])
    content = _sub_literal(r'export const Welcome_data = reactive\(\{[\s\S]*?\}\);', welcome_js, content)

    # statsData
    stats_js = (
        f'export var statsData = {{\n daysInWeChat: {days_in_year},\n'
        f' numOfFriends: {len(friend_msg_counts)},\n'
        f' messagesSent: {total_sent},\n'
        f' messagesReceived: {total_received},\n'
        f' totalWords: {total_words}\n}};'
    )
    content = _sub_literal(r'export var statsData = \{[^}]+\};', stats_js, content)

    # yAxisData (messages per hour)
    content = _sub_literal(
        r'export var yAxisData = \[[^\]]+\];',
        f'export var yAxisData = {hour_counts};',
        content,
    )

    # chatFriendsData
    friends_js = (
        "export const chatFriendsData = reactive({\n chatFriends:[\n"
        + "".join(
            f" {{\n name: {_js_string(friend['name'])},\n"
            f" messageCount: {_js_string(friend['messageCount'])},\n"
            f" wordCount: {_js_string(friend['wordCount'])},\n"
            f" avatarSrc: {friend['avatarSrc']}\n }},\n"
            for friend in chat_friends_data
        )
        + " ]\n});"
    )
    content = _sub_literal(r'export const chatFriendsData = reactive\(\{[\s\S]*?\}\);', friends_js, content)

    # monthFriendsData
    month_js = (
        "export const monthFriendsData = reactive(\n {\n month_data:[\n"
        + "".join(
            f" {{ month: {_js_string(m['month'])}, nickname: {_js_string(m['nickname'])}, "
            f"className: {_js_string(m['className'])}, num: {m['num']}, avatar: {m['avatar']} }},\n"
            for m in month_friends_data
        )
        + " ]\n }\n)"
    )
    content = _sub_literal(r'export const monthFriendsData = reactive\(\s*\{[\s\S]*?\}\s*\)', month_js, content)

    # keywordsData
    keywords_js = (
        "export const keywordsData = reactive({\n"
        + f" keyword: {_js_string(top_keyword)},\n"
        + f" keyword_num: {top_keyword_num},\n"
        + " messages : [],\n"  # Clear dummy messages
        + " chart_option:{},\n"
        + " word_counter:[\n"
        + "".join(f" [{_js_string(w)},{c}],\n" for w, c in keywords_list)
        + " ]\n});"
    )
    content = _sub_literal(r'export const keywordsData = reactive\(\{[\s\S]*?\}\);', keywords_js, content)

    # stepData (heatmap); matches whatever expression was assigned before.
    content = _sub_literal(
        r'export var\s+stepData\s+=\s+[^;]+;',
        f'export var stepData = {heatmap_data_js};',
        content,
    )

    # Day with the highest step count.
    max_day_str = '2025-01-01'
    max_day_count = 0
    if daily_step_counts:
        max_day_str = max(daily_step_counts, key=daily_step_counts.get)
        max_day_count = daily_step_counts[max_day_str]
    max_date = datetime.datetime.strptime(max_day_str, '%Y-%m-%d')

    # Yearly step total and the derived distance.
    total_steps = sum(daily_step_counts.values())
    distance_km = int(total_steps * 0.0007)  # roughly 0.7 m per step
    earth_rounds = round(distance_km / 40075, 2)  # Earth circumference ~40075 km

    # stepdescription
    step_desc_js = "\n".join([
        "export const stepdescription = {",
        "sumUp: '行万里路',",
        "left: {",
        "totalStepsPrefix: '年度总步数',",
        f"totalSteps: {total_steps:d},",
        "distancePrefix: '相当于走了',",
        f"distance: {distance_km:d},",
        "distanceSuffix: '公里',",
        "earthPrefix: '绕了地球',",
        f"earthRounds: {earth_rounds},",
        "earthSuffix: '',",
        "},",
        "right: {",
        f"year: '{max_date.year}',",
        f"month: '{max_date.month:02d}',",
        f"day: '{max_date.day:02d}',",
        "stepsPrefix: '达成',",
        f"steps: {max_day_count:d},",
        "stepsSuffix: '',",
        "message: '这一天,走过的是未知的风景,留下的是每一步的精彩',",
        "},",
        "};",
    ])
    content = _sub_literal(r'export const stepdescription = \{[\s\S]*?\};', step_desc_js, content)

    # Summary card (wechatReportData)
    summary_friends_js = (
        "friends : [\n"
        + "".join(
            f" {{ name: {_js_string(friend['name'], quote=chr(39))}, avatarSrc: {friend['avatarSrc']} }},\n"
            for friend in chat_friends_data
        )
        + " ],"
    )
    content = _sub_literal(r'friends : \[[\s\S]*?\],', summary_friends_js, content)

    summary_numbers = (
        ('聊天联系人', len(friend_msg_counts)),
        ('发送消息', total_sent),
        ('收到消息', total_received),
        ('发送总字数', total_words),
    )
    for label, value in summary_numbers:
        content = _sub_literal(
            r"\{ label: '" + label + r"', value: \d+, unit: '' \}",
            f"{{ label: '{label}', value: {value}, unit: '' }}",
            content,
        )
    content = _sub_literal(
        r"\{ label: '年度关键词', value: '[^']+' \}",
        f"{{ label: '年度关键词', value: {_js_string(top_keyword, quote=chr(39))} }}",
        content,
    )
    content = _sub_literal(
        r"\{ label: '常用表情包', image: '[^']+' \}",
        f"{{ label: '常用表情包', image: {top_emoji_src} }}",
        content,
    )

    with open(js_file, 'w', encoding='utf-8') as f:
        f.write(content)
    print("生成完成!请刷新网页查看。")
# Script entry point: build the report data only when run directly.
if __name__ == "__main__":
    generate_report_data()