offline_data_model_pipline/data_generate/query_completion/frequency.py

92 lines
3.4 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright @2024 AI. Inspur Inc.
#
# @author: suojiayi <suojiayi@inspur.com>
# @date: 2025/05/13
import json
import pandas as pd
import json
from collections import Counter
import pandas as pd
def process_jsonl_and_save_to_excel(jsonl_file, output_excel, content_index=-2):
    """Tally message ``content`` values from a JSONL file and save them to Excel.

    Every non-blank line is parsed as a JSON object. When its ``data`` value
    is a list long enough to hold position ``content_index``, the ``content``
    of the entry at that position is collected (an entry without a
    ``content`` key contributes an empty string). The collected values are
    counted and written as a two-column sheet (``Content``, ``Frequency``)
    ordered by descending frequency.

    NOTE(review): the earlier description of this function said the *last*
    entry was used, but the code has always read index ``-2`` (the
    second-to-last entry). That behaviour is kept as the default; pass
    ``content_index=-1`` to read the last entry instead.

    Args:
        jsonl_file: Path of the input JSONL file.
        output_excel: Path of the Excel file to write.
        content_index: Position inside each ``data`` list to read.
            Defaults to ``-2``.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    collected = []

    with open(jsonl_file, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                # Blank lines (e.g. a trailing newline) are not records.
                continue

            record = json.loads(line)
            if not isinstance(record, dict):
                continue

            messages = record.get('data')
            if not isinstance(messages, list):
                continue

            # Skip records too short for the requested position instead of
            # raising IndexError (a one-element list has no index -2).
            if not -len(messages) <= content_index < len(messages):
                continue

            entry = messages[content_index]
            if isinstance(entry, dict):
                collected.append(entry.get('content', ''))

    # Count identical contents.
    content_counter = Counter(collected)

    df = pd.DataFrame(list(content_counter.items()), columns=['Content', 'Frequency'])
    # Most frequent contents first.
    df = df.sort_values(by='Frequency', ascending=False)

    df.to_excel(output_excel, index=False)
    print(f"统计结果已保存到 {output_excel}")
# Example invocation. Guarded so that importing this module does not start the
# export; running the file as a script behaves exactly as before.
if __name__ == '__main__':
    jsonl_file = '/dataset-pvc/suojiayi/duihuabuquan/train_prepare/20250425_103516/tmp_data/instruct_data_dhbq_0425_filtered_2504251824.jsonl'  # input JSONL file path
    output_excel = "./dhbq/frequency.xlsx"  # output Excel file path
    process_jsonl_and_save_to_excel(jsonl_file, output_excel)
def index_jsonl(jsonl_file_path):
    """Index the records of a JSONL file by the ``content`` of their last ``data`` entry.

    Every non-blank line is parsed as a JSON object. A record is indexed when
    its ``data`` value is a non-empty list whose last element is a dict with
    a non-null ``content``. When several records share the same ``content``,
    the one that appears last in the file wins.

    Args:
        jsonl_file_path: Path of the JSONL file to index.

    Returns:
        dict: Maps each ``content`` value to the full parsed record.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    index_dict = {}

    with open(jsonl_file_path, 'r', encoding='utf-8') as file:
        for line in file:
            line = line.strip()
            if not line:
                # json.loads('') raises, so blank lines must be skipped.
                continue

            record = json.loads(line)
            if not isinstance(record, dict):
                continue

            data_list = record.get('data')
            # Only a non-empty list can provide a last entry.
            if not isinstance(data_list, list) or not data_list:
                continue

            last_entry = data_list[-1]
            if not isinstance(last_entry, dict):
                continue

            content_value = last_entry.get('content')
            if content_value is not None:
                index_dict[content_value] = record

    return index_dict
def update_excel_with_matched_data(input_excel_path, output_excel_path, jsonl_index):
    """Add a ``matched_json`` column to an Excel sheet using a lookup index.

    The sheet at ``input_excel_path`` is loaded, each value of its first
    column is looked up in ``jsonl_index``, and the lookup results (``None``
    where there is no match) are stored in a new ``matched_json`` column.
    The resulting table is written to ``output_excel_path`` without the
    row index.

    Args:
        input_excel_path: Path of the Excel file to read.
        output_excel_path: Path of the Excel file to write.
        jsonl_index: Mapping from first-column values to the matched record.
    """
    frame = pd.read_excel(input_excel_path)

    lookups = []
    for key in frame.iloc[:, 0]:
        lookups.append(jsonl_index.get(key))

    frame['matched_json'] = lookups
    frame.to_excel(output_excel_path, index=False)
# Usage example
if __name__ == '__main__':
    # Replace the three paths below with your actual locations.
    jsonl_file_path = './dhbq/dhbq_merged_with_score_5.jsonl'  # JSONL file to index
    input_excel_path = './dhbq/frequency.xlsx'  # Excel workbook to read
    output_excel_path = './dhbq/frequency-score-5.xlsx'  # Excel workbook to write

    print("开始索引jsonl文件...")
    jsonl_index = index_jsonl(jsonl_file_path)
    record_count = len(jsonl_index)
    print(f"完成索引,共索引 {record_count} 条记录")

    update_excel_with_matched_data(input_excel_path, output_excel_path, jsonl_index)
    print(f"处理完成,已将结果保存至{output_excel_path}")