# Keep only the message records exchanged between the two users: a line is
# kept when it starts with one of the two username prefixes and also matches
# the `reg` pattern.
for line in lines:
    # str.startswith accepts a tuple, so one call covers both prefixes.
    is_sender_line = line.startswith(('对方的用户名', '你的用户名'))
    if is_sender_line and re.match(reg, line):
        filter_lines.append(line.strip())
# Fetch every chat record from the `log` table.
cur.execute("select * from log")
r = cur.fetchall()

# Container for the analysis output that is written to disk at the end of
# the script.
result = {}
# Find the longest single message, skipping any message that contains 'http'.
# Column 3 of each record is used as the message text
# (NOTE(review): confirm against the `log` table schema).
max_item = None
for item in r:
    content = item[3]
    is_longer = max_item is None or len(content) > len(max_item[3])
    if is_longer and 'http' not in content:
        max_item = item
# Prints None when no record qualifies.
print(max_item)
# Tokenize every message with jieba and collect all tokens in one flat list.
word_arr = []
for item in r:
    content = item[3]
    # extend() appends in place; rebuilding the list with `+` on every
    # iteration would copy it each time (quadratic in the number of tokens).
    word_arr.extend(jieba.cut(content))
# Count how many times each token occurs: token -> number of occurrences.
word_count_map = {}
for word in word_arr:
    word_count_map[word] = word_count_map.get(word, 0) + 1
# Turn the frequency map into a list of {'word': ..., 'count': ...} records.
word_count_arr = [
    {'word': word, 'count': count}
    for word, count in word_count_map.items()
]
# Sort by word frequency.
def custom_sort(x, y):
    """Compare two word-count records so that higher counts order first.

    Args:
        x: Mapping with a 'count' entry.
        y: Mapping with a 'count' entry.

    Returns:
        -1 if x's count is greater than y's, 1 if it is smaller, 0 if equal.

    Note:
        This is an old-style comparison function. Under Python 3 it has to
        be wrapped with functools.cmp_to_key before being handed to
        sorted() / list.sort().
    """
    if x['count'] > y['count']:
        return -1
    if x['count'] < y['count']:
        return 1
    return 0
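# Alternative output (disabled): dump the whole result as JSON.
# The path needs a raw string, otherwise "\b" and "\r" are read as escapes.
# with open(r"wechat-report\bin\result.json", "w", encoding="utf-8") as f:
#     f.write(
#         json.dumps(result, ensure_ascii=False)
#     )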
# Write one "word;count" line per record to the CSV file, using a semicolon
# as the separator.
# NOTE(review): `result['word']` must already hold the list of word/count
# records at this point; the assignment is not visible in this part of the
# script -- confirm it happens before this block.
with open(r"wechat-report\bin\result.csv", "w", encoding="utf-8") as f:
    for entry in result['word']:
        f.write(f"{entry['word']};{entry['count']}\n")