# offline_data_model_pipline/data_generate/query_completion/frequency.py

#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright @2024 AI. Inspur Inc.
#
# @author: suojiayi <suojiayi@inspur.com>
# @date: 2025/05/13

import json
import pandas as pd

import json
from collections import Counter
import pandas as pd

def process_jsonl_and_save_to_excel(jsonl_file, output_excel, content_index=-2):
    """Count how often each ``content`` value occurs in a JSONL file and save the counts to Excel.

    Every non-blank line is parsed as a JSON object. When the object's ``data``
    field is a list that can be indexed with ``content_index``, the ``content``
    of that element is collected (``''`` when the key is absent). The collected
    values are counted and written to ``output_excel`` with the columns
    ``Content`` and ``Frequency``, sorted by descending frequency.

    Lines that are blank, records without a list-valued ``data`` field, lists
    too short for ``content_index`` and non-dict elements are skipped instead
    of raising.

    NOTE(review): the earlier documentation described the counted element as
    the *last* one, while the code has always read index ``-2``. The default
    keeps that existing behaviour; pass ``content_index=-1`` to count the last
    element instead.

    Args:
        jsonl_file: Path of the input JSONL file (read as UTF-8).
        output_excel: Path of the Excel file to write.
        content_index: Index into ``data`` of the element whose ``content`` is
            counted. Defaults to ``-2`` (second-to-last element).

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    collected = []

    with open(jsonl_file, 'r', encoding='utf-8') as f:
        for raw_line in f:
            text = raw_line.strip()
            if not text:
                # Blank lines are not records; json.loads would reject them.
                continue
            record = json.loads(text)
            if not isinstance(record, dict):
                continue
            turns = record.get('data')
            if not isinstance(turns, list):
                continue
            # The range check must match the index actually used: a plain
            # "non-empty" test does not protect a lookup at position -2.
            if not -len(turns) <= content_index < len(turns):
                continue
            turn = turns[content_index]
            if isinstance(turn, dict):
                collected.append(turn.get('content', ''))

    # Count identical contents.
    content_counter = Counter(collected)

    # One row per distinct content, most frequent first.
    df = pd.DataFrame(list(content_counter.items()), columns=['Content', 'Frequency'])
    df = df.sort_values(by='Frequency', ascending=False)

    df.to_excel(output_excel, index=False)
    print(f"统计结果已保存到 {output_excel}")

# Example invocation: locations used for the frequency report.
jsonl_file = '/dataset-pvc/suojiayi/duihuabuquan/train_prepare/20250425_103516/tmp_data/instruct_data_dhbq_0425_filtered_2504251824.jsonl'  # input JSONL file path
output_excel = "./dhbq/frequency.xlsx"  # output Excel file path

# Run the report only when this file is executed as a script, so that merely
# importing the module does not trigger file I/O against the hard-coded path.
if __name__ == '__main__':
    process_jsonl_and_save_to_excel(jsonl_file, output_excel)

def index_jsonl(jsonl_file_path):
    """Index a JSONL file by the ``content`` of the last element of each record's ``data`` list.

    Blank lines, records whose ``data`` field is missing, empty or not a list,
    and records whose last ``data`` element is not a dict or has no
    ``content`` (or a ``None`` one) are skipped. When several records share
    the same ``content`` value, the record that appears last in the file wins.

    Args:
        jsonl_file_path: Path of the JSONL file (read as UTF-8).

    Returns:
        dict: Maps each ``content`` value to the full parsed record.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    index_dict = {}
    with open(jsonl_file_path, 'r', encoding='utf-8') as file:
        for raw_line in file:
            text = raw_line.strip()
            if not text:
                # Blank lines are not records; json.loads would reject them.
                continue
            record = json.loads(text)
            if not isinstance(record, dict):
                continue
            data_list = record.get('data')
            if not isinstance(data_list, list) or not data_list:
                continue
            last_item = data_list[-1]
            if not isinstance(last_item, dict):
                continue
            content_value = last_item.get('content')
            if content_value is not None:
                index_dict[content_value] = record
    return index_dict


def update_excel_with_matched_data(input_excel_path, output_excel_path, jsonl_index):
    """Add a ``matched_json`` column to an Excel sheet by looking up its first column.

    Each value in the first column of the input sheet is looked up in
    ``jsonl_index``. A matching record is stored as a JSON string (non-ASCII
    characters kept as-is); rows without a match get an empty cell. The
    resulting table is written to ``output_excel_path`` without the index.

    The record is serialised explicitly so that the cell holds valid JSON
    rather than whatever text the Excel writer derives from a raw dict.

    Args:
        input_excel_path: Path of the Excel file to read.
        output_excel_path: Path of the Excel file to write.
        jsonl_index: Mapping from first-column value to the matched record,
            e.g. the result of ``index_jsonl``.
    """
    df = pd.read_excel(input_excel_path)

    matched_data = []
    for value in df.iloc[:, 0]:
        record = jsonl_index.get(value)
        if record is None:
            # No match: leave the cell empty.
            matched_data.append(None)
        else:
            matched_data.append(json.dumps(record, ensure_ascii=False))

    df['matched_json'] = matched_data
    df.to_excel(output_excel_path, index=False)


# Usage example: index a JSONL file, then look up each row of the frequency
# workbook in that index and save the enriched workbook.
if __name__ == '__main__':
    # Replace the three paths below with your actual locations.
    jsonl_file_path = './dhbq/dhbq_merged_with_score_5.jsonl'  # JSONL file to index
    input_excel_path = './dhbq/frequency.xlsx'  # Excel file to read
    output_excel_path = './dhbq/frequency-score-5.xlsx'  # Excel file to write

    # Step 1: build the content -> record index.
    print("开始索引jsonl文件...")
    jsonl_index = index_jsonl(jsonl_file_path)
    print(f"完成索引，共索引 {len(jsonl_index)} 条记录")

    # Step 2: attach the matched records and write the result.
    update_excel_with_matched_data(
        input_excel_path=input_excel_path,
        output_excel_path=output_excel_path,
        jsonl_index=jsonl_index,
    )
    print(f"处理完成，已将结果保存至{output_excel_path}")