offline_data_model_pipline/data_generate/query_completion/merge.py

#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright @2024 AI. Inspur Inc.
#
# @author: suojiayi <suojiayi@inspur.com>
# @date: 2025/05/13
import json


def clean_answer(answer):
    """
    Extract the content wrapped in ```json ... ``` fences and strip the extra
    markers and whitespace. If the wrapped content is valid JSON, return it as
    a compact JSON string; otherwise return the content unchanged.
    """
    # Look for a ```json ... ``` fenced block
    start = answer.find("```json")
    if start != -1:
        # Take the content between ```json and the closing ```
        end = answer.find("```", start + len("```json"))
        if end != -1:
            content = answer[start + len("```json"):end].strip()
        else:
            content = answer.strip()  # no closing ``` found, keep the original content
    else:
        content = answer.strip()  # no ```json marker, keep the original content
    # Try to parse the extracted content as JSON
    try:
        cleaned_json = json.loads(content)
        return json.dumps(cleaned_json, ensure_ascii=False)  # compact JSON string
    except json.JSONDecodeError:
        return content  # not valid JSON, return the content as-is
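
# Illustrative behaviour of clean_answer (hypothetical input, not taken from
# the pipeline's real data):
#   clean_answer('Sure:\n```json\n{ "score" : 5 }\n```')  ->  '{"score": 5}'
#   clean_answer('plain text')                            ->  'plain text'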


def merge_jsonl_files(file1, file2, output_file):
    """
    Merge two JSONL files and clean the 'answer' field of every record.
    """
    merged_data = []
    # Read the first file
    with open(file1, 'r', encoding='utf-8') as f1:
        for line in f1:
            data = json.loads(line.strip())
            if 'answer' in data:
                data['answer'] = clean_answer(data['answer'])
            merged_data.append(data)
    # Read the second file
    with open(file2, 'r', encoding='utf-8') as f2:
        for line in f2:
            data = json.loads(line.strip())
            if 'answer' in data:
                data['answer'] = clean_answer(data['answer'])
            merged_data.append(data)
    # Write the merged records to the output file
    with open(output_file, 'w', encoding='utf-8') as out_f:
        for item in merged_data:
            out_f.write(json.dumps(item, ensure_ascii=False) + '\n')


# Example usage: merge the prompt files first, then the score files
file1 = './dhbq/duihuabuquan_prompt_1.jsonl'
file2 = './dhbq/duihuabuquan_prompt_2.jsonl'
output_file = './dhbq/dhbq_prompt.jsonl'
merge_jsonl_files(file1, file2, output_file)

file1 = './dhbq/duihuabuquan_score_1.jsonl'
file2 = './dhbq/duihuabuquan_score_2.jsonl'
output_file = './dhbq/dhbq_score.jsonl'
merge_jsonl_files(file1, file2, output_file)
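
# Assumed shape of an input line (hypothetical, inferred from the fields the
# code reads): {"uid": "...", "data": "...", "answer": "```json\n{...}\n```"}
# The same record is written out with the fenced block stripped from 'answer'.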


def merge_jsonl_files_by_uid(file1, file2, output_file):
    """
    Merge two JSONL files by UID: keep one 'data' per UID and combine the
    'answer' and 'ins_tag_label' fields. The 'answer' field is renamed to
    'prompt_label', and the 'uid' field is preserved.
    """
    # Read the first file into a dict keyed by uid
    data_dict = {}
    with open(file1, 'r', encoding='utf-8') as f1:
        for line in f1:
            record = json.loads(line.strip())
            uid = record['uid']
            if uid not in data_dict:
                data_dict[uid] = {
                    'uid': uid,  # keep the uid field
                    'data': record['data'],
                    'prompt_label': record.get('answer')
                }
            else:
                print(f"Warning: Duplicate UID found in the first file: {uid}")
    # Process the second file and merge by UID
    with open(file2, 'r', encoding='utf-8') as f2:
        for line in f2:
            record = json.loads(line.strip())
            uid = record['uid']
            if uid in data_dict:
                # UID already known: add or update the ins_tag_label field
                if 'ins_tag_label' in record:
                    data_dict[uid]['ins_tag_label'] = record['ins_tag_label']
            else:
                # UID missing from the first file: add the record directly
                new_record = {
                    'uid': uid,  # keep the uid field
                    'data': record['data']
                }
                if 'ins_tag_label' in record:
                    new_record['ins_tag_label'] = record['ins_tag_label']
                data_dict[uid] = new_record
    # Write the merged records to the output file
    with open(output_file, 'w', encoding='utf-8') as out_f:
        for uid in data_dict:
            out_f.write(json.dumps(data_dict[uid], ensure_ascii=False) + '\n')

# Example usage
file1 = './dhbq/dhbq_prompt.jsonl'
file2 = './dhbq/dhbq_instag.jsonl'
output_file = './dhbq/dhbq.jsonl'
merge_jsonl_files_by_uid(file1, file2, output_file)
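
# A merged line in dhbq.jsonl then carries (hypothetical example):
#   {"uid": "...", "data": "...", "prompt_label": "...", "ins_tag_label": [...]}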


def merge_jsonl_files(file1, file2, output_file):
    """
    Merge two JSONL files:
    - the first file contains 'uid' and the rest of the record;
    - the second file contains 'data_idx', 'cluster_center', and 'embedding';
    - where 'data_idx' matches a 'uid' from the first file, copy
      'cluster_center' and 'embedding' into that record.
    """
    # Read the first file into a dict keyed by 'uid'
    data_dict = {}
    with open(file1, 'r', encoding='utf-8') as f1:
        for line in f1:
            record = json.loads(line.strip())
            uid = record['uid']
            data_dict[uid] = record
    # Read the second file and pull out 'data_idx', 'cluster_center' and 'embedding'
    with open(file2, 'r', encoding='utf-8') as f2:
        for line in f2:
            record = json.loads(line.strip())
            index = record.get('data_idx')
            cluster_center = record.get('cluster_center')
            embedding = record.get('embedding')
            # Merge when the index matches a uid from the first file
            if index in data_dict:
                data_dict[index]['cluster_center'] = cluster_center
                data_dict[index]['embedding'] = embedding
    # Write the merged records to the output file
    with open(output_file, 'w', encoding='utf-8') as out_f:
        for uid in data_dict:
            out_f.write(json.dumps(data_dict[uid], ensure_ascii=False) + '\n')

# Example usage
file1 = './dhbq/dhbq.jsonl'
file2 = './dhbq/dhbq_cluster_kmeans_result.jsonl'
output_file = './dhbq/dhbq_merged.jsonl'
merge_jsonl_files(file1, file2, output_file)
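
# Each line of dhbq_cluster_kmeans_result.jsonl is assumed to look roughly like
# (hypothetical, based on the fields read above):
#   {"data_idx": "...", "cluster_center": [...], "embedding": [...]}
# where 'data_idx' carries the same value as 'uid' in dhbq.jsonl.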


def merge_jsonl_files(file1, file2, output_file):
    """
    Merge two JSONL files:
    - the first file contains 'uid' and the rest of the record;
    - the second file contains 'uid' and an 'answer' holding a quality score;
    - where the UIDs match, copy the score into the first file's record
      under the 'score' key.
    """
    # Read the first file into a dict keyed by 'uid'
    data_dict = {}
    with open(file1, 'r', encoding='utf-8') as f1:
        for line in f1:
            record = json.loads(line.strip())
            uid = record['uid']
            data_dict[uid] = record
    # Read the second file and pull out each uid and its score
    with open(file2, 'r', encoding='utf-8') as f2:
        for line in f2:
            record = json.loads(line.strip())
            index = record.get('uid')
            score = record.get('answer')
            # Merge when the uid exists in the first file
            if index in data_dict:
                data_dict[index]['score'] = score
    # Write the merged records to the output file
    with open(output_file, 'w', encoding='utf-8') as out_f:
        for uid in data_dict:
            out_f.write(json.dumps(data_dict[uid], ensure_ascii=False) + '\n')

# Example usage
file1 = './dhbq/dhbq_merged.jsonl'
file2 = './dhbq/dhbq_score.jsonl'
output_file = './dhbq/dhbq_merged_with_score.jsonl'
merge_jsonl_files(file1, file2, output_file)
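
# dhbq_score.jsonl is assumed to pair each 'uid' with a cleaned 'answer' that
# holds the score, e.g. (hypothetical): {"uid": "...", "answer": "5"}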


def filter_records(input_file_path, output_file_path, target_field='score', target_value=5):
    """
    Keep only the records in a JSONL file whose target field equals a given
    value, and save them to a new JSONL file.
    :param input_file_path: path of the input JSONL file
    :param output_file_path: path of the output JSONL file
    :param target_field: field to check (default 'score')
    :param target_value: value the field must equal (default 5)
    """
    with open(input_file_path, 'r', encoding='utf-8') as infile, \
            open(output_file_path, 'w', encoding='utf-8') as outfile:
        for line in infile:
            record = json.loads(line.strip())
            # Keep the record only if the target field equals the target value
            try:
                if int(record.get(target_field)) == target_value:
                    outfile.write(json.dumps(record, ensure_ascii=False) + '\n')
            except (TypeError, ValueError):
                # Missing or non-numeric field: skip the record
                continue


# Example usage
if __name__ == '__main__':
    input_file_path = './dhbq/dhbq_merged_with_score.jsonl'     # input JSONL path, adjust as needed
    output_file_path = './dhbq/dhbq_merged_with_score_5.jsonl'  # output JSONL path, adjust as needed
    filter_records(input_file_path, output_file_path)
    print(f"Filtering complete; matching records saved to {output_file_path}")