#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright @2024 AI. Inspur Inc.
#
# @author: suojiayi
# @date: 2025/05/13
"""Frequency statistics and record matching for dialogue JSONL data.

The script has two steps:

1. Count how often each ``content`` value occurs at a given position of the
   ``data`` list of every JSONL record and save the counts to an Excel file.
2. Look up every value of the first column of an Excel file in an index built
   from another JSONL file and store the matched record next to it.
"""
import json
from collections import Counter
from typing import Any, Iterator, Optional

import pandas as pd


def _iter_jsonl_records(path: str) -> Iterator[Any]:
    """Yield the parsed JSON value of every non-blank line of *path*."""
    with open(path, 'r', encoding='utf-8') as handle:
        for raw_line in handle:
            stripped = raw_line.strip()
            # Blank lines (e.g. a trailing newline at end of file) are not
            # valid JSON and would make json.loads raise.
            if stripped:
                yield json.loads(stripped)


def _message_at(record: Any, index: int) -> Optional[dict]:
    """Return ``record['data'][index]`` if it is a dict, otherwise ``None``.

    ``None`` is returned when the record is not a dict, has no list under
    ``'data'``, the list is too short for *index*, or the item is not a dict.
    """
    if not isinstance(record, dict):
        return None
    messages = record.get('data')
    if not isinstance(messages, list):
        return None
    try:
        message = messages[index]
    except IndexError:
        return None
    return message if isinstance(message, dict) else None


def _to_excel_cell(value: Any) -> Any:
    """Serialise dict/list values to JSON text; return other values as is.

    Excel cells cannot hold Python containers, so matched records have to be
    stored as text.
    """
    if isinstance(value, (dict, list)):
        return json.dumps(value, ensure_ascii=False)
    return value


def process_jsonl_and_save_to_excel(jsonl_file: str, output_excel: str,
                                    content_index: int = -1) -> None:
    """Count ``content`` values of a JSONL file and save them to Excel.

    For every record, the item at position *content_index* of its ``data``
    list is taken and its ``content`` value is counted (a missing ``content``
    key is counted as the empty string). The result is written with the
    columns ``Content`` and ``Frequency``, most frequent first; equal
    frequencies keep the order of first appearance.

    Records without a usable item at *content_index* are skipped.

    Args:
        jsonl_file: Path of the input JSONL file.
        output_excel: Path of the Excel file to write.
        content_index: Position inside ``data`` to read. Defaults to ``-1``
            (the last item), which is what this function is documented to
            extract. Earlier revisions read index ``-2`` and crashed on
            single-item lists; pass ``content_index=-2`` if the
            second-to-last item is really wanted.
    """
    content_counter = Counter()
    for record in _iter_jsonl_records(jsonl_file):
        message = _message_at(record, content_index)
        if message is not None:
            content_counter[message.get('content', '')] += 1

    # most_common() sorts by descending count and is stable for ties.
    df = pd.DataFrame(content_counter.most_common(),
                      columns=['Content', 'Frequency'])
    df.to_excel(output_excel, index=False)

    print(f"统计结果已保存到 {output_excel}")


def index_jsonl(jsonl_file_path: str) -> dict:
    """Index a JSONL file by the ``content`` of the last ``data`` item.

    Args:
        jsonl_file_path: Path of the JSONL file to index.

    Returns:
        A dict mapping the ``content`` value of the last item of each
        record's ``data`` list to the whole record. Records with an empty or
        missing ``data`` list, or whose last item has no ``content`` (or a
        ``None`` one), are left out. When several records share the same
        content, the last one in the file wins.
    """
    index_dict = {}
    for record in _iter_jsonl_records(jsonl_file_path):
        message = _message_at(record, -1)
        if message is None:
            continue
        content_value = message.get('content')
        if content_value is not None:
            index_dict[content_value] = record
    return index_dict


def update_excel_with_matched_data(input_excel_path: str,
                                   output_excel_path: str,
                                   jsonl_index: dict) -> None:
    """Add a ``matched_json`` column to an Excel file using *jsonl_index*.

    Every value of the first column of the input sheet is looked up in
    *jsonl_index*. The matched record is stored as JSON text in the new
    ``matched_json`` column; rows without a match get an empty cell.

    Args:
        input_excel_path: Path of the Excel file to read.
        output_excel_path: Path of the Excel file to write.
        jsonl_index: Mapping from first-column values to records, e.g. the
            result of ``index_jsonl``.
    """
    df = pd.read_excel(input_excel_path)

    df['matched_json'] = [
        _to_excel_cell(jsonl_index.get(value)) for value in df.iloc[:, 0]
    ]

    df.to_excel(output_excel_path, index=False)


# Default paths of the frequency-counting step.
jsonl_file = '/dataset-pvc/suojiayi/duihuabuquan/train_prepare/20250425_103516/tmp_data/instruct_data_dhbq_0425_filtered_2504251824.jsonl'  # input JSONL file
output_excel = "./dhbq/frequency.xlsx"  # output Excel file


# Usage example. Both steps run only when the file is executed as a script,
# so importing the module has no side effects.
if __name__ == '__main__':
    # Step 1: build the frequency table.
    process_jsonl_and_save_to_excel(jsonl_file, output_excel)

    # Step 2: attach the matching scored record to every row of the table.
    input_excel_path = './dhbq/frequency.xlsx'  # input Excel file; replace with your own path
    output_excel_path = './dhbq/frequency-score-5.xlsx'  # output Excel file; replace with your own path
    jsonl_file_path = './dhbq/dhbq_merged_with_score_5.jsonl'  # JSONL file to index; replace with your own path

    print("开始索引jsonl文件...")
    jsonl_index = index_jsonl(jsonl_file_path)
    print(f"完成索引,共索引 {len(jsonl_index)} 条记录")

    update_excel_with_matched_data(input_excel_path, output_excel_path, jsonl_index)
    print(f"处理完成,已将结果保存至{output_excel_path}")