92 lines
3.4 KiB
Python
92 lines
3.4 KiB
Python
|
#!/usr/bin/env python
|
|||
|
# -*- coding: utf-8 -*-
|
|||
|
#
|
|||
|
# Copyright @2024 AI. Inspur Inc.
|
|||
|
#
|
|||
|
# @author: suojiayi <suojiayi@inspur.com>
|
|||
|
# @date: 2025/05/13
|
|||
|
|
|||
|
import json
|
|||
|
import pandas as pd
|
|||
|
|
|||
|
import json
|
|||
|
from collections import Counter
|
|||
|
import pandas as pd
|
|||
|
|
|||
|
def process_jsonl_and_save_to_excel(jsonl_file, output_excel, content_index=-2):
    """Count how often each message ``content`` occurs and save the counts to Excel.

    Every non-blank line of ``jsonl_file`` is parsed as JSON. When the parsed
    object holds a list under ``'data'``, the ``'content'`` value of the list
    element at ``content_index`` is collected (an element without that key is
    counted as ``''``). The collected values are tallied and written to
    ``output_excel`` with the columns ``Content`` and ``Frequency``, sorted by
    descending frequency.

    A record is skipped, rather than aborting the whole run, when:
      * the line is blank,
      * the parsed value is not an object or has no list under ``'data'``,
      * the list is too short to contain ``content_index``,
      * the selected element is not an object.

    Args:
        jsonl_file: Path of the input JSONL file (read as UTF-8).
        output_excel: Path of the Excel file to write.
        content_index: Position inside ``data`` of the element whose content
            is counted. Defaults to ``-2`` (second-to-last element), which is
            the position this function has always read; pass ``-1`` to count
            the last element instead.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    collected = []

    with open(jsonl_file, 'r', encoding='utf-8') as f:
        for line in f:
            # Blank lines carry no record and would make json.loads raise.
            if not line.strip():
                continue

            record = json.loads(line)
            messages = record.get('data') if isinstance(record, dict) else None
            if not isinstance(messages, list):
                continue

            # A list shorter than the requested position (e.g. a single-turn
            # record with the default -2) has nothing to count.
            try:
                message = messages[content_index]
            except IndexError:
                continue

            if isinstance(message, dict):
                collected.append(message.get('content', ''))

    # Tally identical contents.
    content_counter = Counter(collected)

    # One row per distinct content, most frequent first.
    df = pd.DataFrame(list(content_counter.items()), columns=['Content', 'Frequency'])
    df = df.sort_values(by='Frequency', ascending=False)

    df.to_excel(output_excel, index=False)
    print(f"统计结果已保存到 {output_excel}")
|
|||
|
|
|||
|
# Example invocation.
# NOTE(review): these statements sit at module top level, outside the
# ``__main__`` guard, so they run whenever this file is imported or executed.
output_excel = "./dhbq/frequency.xlsx"  # path of the Excel file to write
jsonl_file = '/dataset-pvc/suojiayi/duihuabuquan/train_prepare/20250425_103516/tmp_data/instruct_data_dhbq_0425_filtered_2504251824.jsonl'  # path of the input JSONL file

process_jsonl_and_save_to_excel(jsonl_file=jsonl_file, output_excel=output_excel)
|
|||
|
|
|||
|
def index_jsonl(jsonl_file_path):
    """Index a JSONL file by the ``content`` of the last element of each record's ``data`` list.

    Every non-blank line is parsed as JSON. A record is indexed only when it
    is an object whose ``'data'`` value is a non-empty list, the last element
    of that list is an object, and that element has a non-``None``
    ``'content'``. Blank lines and records of any other shape are skipped.

    When several records share the same content, the record that appears
    later in the file replaces the earlier one.

    Args:
        jsonl_file_path: Path of the JSONL file (read as UTF-8).

    Returns:
        dict: Maps each content value to the full parsed record.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    index_dict = {}
    with open(jsonl_file_path, 'r', encoding='utf-8') as file:
        for line in file:
            stripped = line.strip()
            # Blank lines carry no record and would make json.loads raise.
            if not stripped:
                continue

            record = json.loads(stripped)
            if not isinstance(record, dict):
                continue

            data_list = record.get('data', [])
            # Require a non-empty list so that [-1] is a message element and
            # not, for example, the last character of a string.
            if not isinstance(data_list, list) or not data_list:
                continue

            last_item = data_list[-1]
            if not isinstance(last_item, dict):
                continue

            content_value = last_item.get('content')
            if content_value is not None:
                index_dict[content_value] = record
    return index_dict
|
|||
|
|
|||
|
|
|||
|
def update_excel_with_matched_data(input_excel_path, output_excel_path, jsonl_index):
    """Add a ``matched_json`` column to an Excel sheet using a lookup index.

    The sheet at ``input_excel_path`` is loaded, each value in its first
    column is looked up in ``jsonl_index`` (values with no entry yield
    ``None``), the lookup results are stored in a new ``matched_json``
    column, and the sheet is written to ``output_excel_path`` without the
    row index.

    Args:
        input_excel_path: Path of the Excel file to read.
        output_excel_path: Path of the Excel file to write.
        jsonl_index: Mapping from first-column values to the object stored
            in the new column.
    """
    sheet = pd.read_excel(input_excel_path)

    # Resolve every first-column value against the index, in row order.
    first_column = sheet.iloc[:, 0]
    sheet['matched_json'] = list(map(jsonl_index.get, first_column))

    sheet.to_excel(output_excel_path, index=False)
|
|||
|
|
|||
|
|
|||
|
# Usage example
if __name__ == '__main__':
    # Replace these paths with your own.
    jsonl_file_path = './dhbq/dhbq_merged_with_score_5.jsonl'  # JSONL file to index
    input_excel_path = './dhbq/frequency.xlsx'  # Excel file to read
    output_excel_path = './dhbq/frequency-score-5.xlsx'  # Excel file to write

    # Step 1: build the content -> record index from the JSONL file.
    print("开始索引jsonl文件...")
    jsonl_index = index_jsonl(jsonl_file_path)
    print(f"完成索引,共索引 {len(jsonl_index)} 条记录")

    # Step 2: attach the matched records to the sheet and save it.
    update_excel_with_matched_data(
        input_excel_path=input_excel_path,
        output_excel_path=output_excel_path,
        jsonl_index=jsonl_index,
    )
    print(f"处理完成,已将结果保存至{output_excel_path}")
|
|||
|
|
|||
|
|