offline_data_model_pipline/data_generate/query_completion/frequency.py

92 lines
3.4 KiB
Python
Raw Normal View History

2025-05-13 13:00:51 +08:00
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright @2024 AI. Inspur Inc.
#
# @author: suojiayi <suojiayi@inspur.com>
# @date: 2025/05/13
import json
import pandas as pd
import json
from collections import Counter
import pandas as pd
def process_jsonl_and_save_to_excel(jsonl_file, output_excel, content_index=-2):
    """Count how often each message ``content`` occurs in a JSONL file and save the counts to Excel.

    Every non-blank line of ``jsonl_file`` is parsed as one JSON object. When
    the object carries a ``data`` list, the entry at ``content_index`` is
    selected and its ``content`` value (``''`` when the key is absent) is
    counted. The counts are written to ``output_excel`` as a sheet with the
    columns ``Content`` and ``Frequency``, most frequent first; contents with
    equal counts keep the order in which they were first seen. A confirmation
    message is printed afterwards.

    A record is skipped, rather than aborting the whole run, when it is not a
    JSON object, when ``data`` is missing or not a list, when the list is too
    short for ``content_index``, or when the selected entry is not a JSON
    object.

    Args:
        jsonl_file: Path of the input JSONL file (read as UTF-8).
        output_excel: Path of the Excel file to write.
        content_index: Position inside ``data`` of the entry whose ``content``
            is counted. The default ``-2`` (second-to-last entry) is the index
            this function has always used.
            NOTE(review): the earlier description of this function spoke of the
            *last* content; pass ``-1`` if that is what is actually wanted.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    content_counter = Counter()

    with open(jsonl_file, 'r', encoding='utf-8') as f:
        for line in f:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            if not line.strip():
                continue

            record = json.loads(line)
            if not isinstance(record, dict):
                continue

            messages = record.get('data')
            if not isinstance(messages, list):
                continue

            # Bounds check: a one-element list has no [-2] entry, which used
            # to raise IndexError even though the list was non-empty.
            if not -len(messages) <= content_index < len(messages):
                continue

            entry = messages[content_index]
            if not isinstance(entry, dict):
                continue

            content_counter[entry.get('content', '')] += 1

    # most_common() already yields (content, count) pairs in descending count
    # order, so no extra sort is needed.
    df = pd.DataFrame(content_counter.most_common(), columns=['Content', 'Frequency'])
    df.to_excel(output_excel, index=False)
    print(f"统计结果已保存到 {output_excel}")
# Example invocation of the frequency export.
# The two paths stay module-level constants, but the export itself only runs
# when this file is executed as a script, so importing the module no longer
# triggers file processing against a hard-coded path.
jsonl_file = '/dataset-pvc/suojiayi/duihuabuquan/train_prepare/20250425_103516/tmp_data/instruct_data_dhbq_0425_filtered_2504251824.jsonl'  # input JSONL file path
output_excel = "./dhbq/frequency.xlsx"  # output Excel file path
if __name__ == '__main__':
    process_jsonl_and_save_to_excel(jsonl_file, output_excel)
def index_jsonl(jsonl_file_path):
    """Index a JSONL file by the ``content`` of the last entry in each record's ``data`` list.

    Every non-blank line is parsed as one JSON object. For records whose
    ``data`` field is a non-empty list, the ``content`` value of the last
    entry becomes the key and the whole parsed record becomes the value.

    Blank lines are ignored. Records are skipped when they are not JSON
    objects, when ``data`` is missing, empty or not a list, when the last
    entry is not a JSON object, or when it has no ``content`` (or a null one).

    Args:
        jsonl_file_path: Path of the JSONL file to index (read as UTF-8).

    Returns:
        dict: Maps each ``content`` value to its parsed record. When several
        records share the same content, the one appearing last in the file
        wins.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    index_dict = {}
    with open(jsonl_file_path, 'r', encoding='utf-8') as file:
        for line in file:
            line = line.strip()
            # json.loads('') raises, so skip blank lines such as a trailing newline.
            if not line:
                continue

            record = json.loads(line)
            if not isinstance(record, dict):
                continue

            data_list = record.get('data')
            if not isinstance(data_list, list) or not data_list:
                continue

            last_entry = data_list[-1]
            if not isinstance(last_entry, dict):
                continue

            content_value = last_entry.get('content')
            if content_value is not None:
                index_dict[content_value] = record
    return index_dict
def update_excel_with_matched_data(input_excel_path, output_excel_path, jsonl_index):
    """Add a ``matched_json`` column to an Excel sheet by looking up its first column in an index.

    The sheet at ``input_excel_path`` is read, each value of its first column
    is looked up in ``jsonl_index``, and the result is stored in a new
    ``matched_json`` column before the sheet is written to
    ``output_excel_path`` (without the DataFrame index).

    Matched dict/list records are serialised with ``json.dumps`` (keeping
    non-ASCII characters readable) so the column really holds JSON text
    instead of the Python ``repr`` the Excel writer would otherwise produce.
    Other matched values are stored unchanged; rows without a match get an
    empty cell.

    Args:
        input_excel_path: Path of the Excel file to read.
        output_excel_path: Path of the Excel file to write.
        jsonl_index: Mapping from first-column values to the record to attach,
            e.g. the dict returned by ``index_jsonl``.
    """
    df = pd.read_excel(input_excel_path)

    # A sheet without any column has no first column to look up.
    lookup_keys = df.iloc[:, 0] if df.shape[1] else []

    matched_data = []
    for value in lookup_keys:
        record = jsonl_index.get(value)
        if isinstance(record, (dict, list)):
            record = json.dumps(record, ensure_ascii=False)
        matched_data.append(record)

    df['matched_json'] = matched_data
    df.to_excel(output_excel_path, index=False)
# Example usage: index a JSONL file by content, then attach the matching
# record to every row of an Excel sheet. Replace the paths with your own.
if __name__ == '__main__':
    scored_jsonl = './dhbq/dhbq_merged_with_score_5.jsonl'  # JSONL file to index
    workbook_in = './dhbq/frequency.xlsx'  # Excel file to read
    workbook_out = './dhbq/frequency-score-5.xlsx'  # Excel file to write

    print("开始索引jsonl文件...")
    records_by_content = index_jsonl(scored_jsonl)
    print(f"完成索引,共索引 {len(records_by_content)} 条记录")

    update_excel_with_matched_data(workbook_in, workbook_out, records_by_content)
    print(f"处理完成,已将结果保存至{workbook_out}")