#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright @2024 AI. Inspur Inc.
#
# @author: suojiayi
# @date: 2025/05/13
"""Merge, clean and filter the ``dhbq`` JSONL files.

Run as a script, the module executes the following steps in order:

1. merge the two score files and clean their ``answer`` field;
2. join the prompt file with the instruction-tag file on ``uid``;
3. attach ``cluster_center`` from the k-means result file;
4. attach ``score`` and ``embedding`` from the merged score file;
5. keep only the records whose ``score`` equals 5.
"""
import json

_FENCE = "```"
_JSON_FENCE = "```json"

# NOTE(review): the original script assigned these "prompt" paths and then
# immediately overwrote them with the "score" paths below, so the prompt merge
# was never executed; './dhbq/dhbq_prompt.jsonl' is expected to exist already.
# Kept for reference only -- confirm whether the prompt merge should run too.
PROMPT_MERGE_PATHS = (
    './dhbq/duihuabuquan_prompt_1.jsonl',
    './dhbq/duihuabuquan_prompt_2.jsonl',
    './dhbq/dhbq_prompt.jsonl',
)
SCORE_MERGE_PATHS = (
    './dhbq/duihuabuquan_score_1.jsonl',
    './dhbq/duihuabuquan_score_2.jsonl',
    './dhbq/dhbq_score.jsonl',
)


def _read_jsonl(path):
    """Yield one parsed JSON value per non-blank line of the file at *path*."""
    with open(path, 'r', encoding='utf-8') as handle:
        for line in handle:
            line = line.strip()
            # A blank (e.g. trailing) line is not a record; json.loads('')
            # would raise on it.
            if line:
                yield json.loads(line)


def _write_jsonl(path, records):
    """Write *records* to *path*, one JSON document per line."""
    with open(path, 'w', encoding='utf-8') as handle:
        for record in records:
            handle.write(json.dumps(record, ensure_ascii=False) + '\n')


def clean_answer(answer):
    """
    Extract the payload of a Markdown code fence and normalise it.

    A fence opened with ```json is preferred; otherwise the first plain ```
    fence is used. When no closing fence follows the opening one, or when the
    text contains no fence at all, the whole stripped text is the payload.

    :param answer: raw answer text; non-string values are returned unchanged
    :return: the payload re-serialised with ``json.dumps`` (single line,
        non-ASCII characters preserved) when it is valid JSON, otherwise the
        stripped payload text
    """
    if not isinstance(answer, str):
        return answer

    content = answer.strip()

    opening = answer.find(_JSON_FENCE)
    if opening != -1:
        body_start = opening + len(_JSON_FENCE)
    else:
        # No ```json tag: fall back to a plain fence instead of slicing from
        # a position derived from find()'s -1 "not found" result.
        opening = answer.find(_FENCE)
        body_start = opening + len(_FENCE) if opening != -1 else -1

    if body_start != -1:
        closing = answer.find(_FENCE, body_start)
        if closing != -1:
            content = answer[body_start:closing].strip()

    try:
        return json.dumps(json.loads(content), ensure_ascii=False)
    except json.JSONDecodeError:
        return content


def merge_and_clean_jsonl_files(file1, file2, output_file):
    """
    Concatenate two JSONL files and clean the ``answer`` field of each record.

    :param file1: path of the first input JSONL file
    :param file2: path of the second input JSONL file
    :param output_file: path of the JSONL file to write
    """
    merged = []
    # Both inputs are read completely before the output is opened.
    for path in (file1, file2):
        for record in _read_jsonl(path):
            if 'answer' in record:
                record['answer'] = clean_answer(record['answer'])
            merged.append(record)
    _write_jsonl(output_file, merged)


def merge_jsonl_files_by_uid(file1, file2, output_file):
    """
    Join two JSONL files on ``uid``.

    Records of the first file are reduced to ``uid``, ``data`` and
    ``prompt_label`` (taken from their ``answer`` field). Records of the
    second file contribute ``ins_tag_label``; a ``uid`` that only occurs in
    the second file produces a record with ``uid``, ``data`` and, if present,
    ``ins_tag_label``.

    :param file1: path of the JSONL file holding ``uid``/``data``/``answer``
    :param file2: path of the JSONL file holding ``uid``/``data`` and
        optionally ``ins_tag_label``
    :param output_file: path of the JSONL file to write
    """
    by_uid = {}

    for record in _read_jsonl(file1):
        uid = record['uid']
        if uid in by_uid:
            # The first occurrence wins; later duplicates are only reported.
            print(f"Warning: Duplicate UID found in the first file: {uid}")
            continue
        by_uid[uid] = {
            'uid': uid,
            'data': record['data'],
            'prompt_label': record.get('answer'),
        }

    for record in _read_jsonl(file2):
        uid = record['uid']
        if uid not in by_uid:
            by_uid[uid] = {'uid': uid, 'data': record['data']}
        if 'ins_tag_label' in record:
            by_uid[uid]['ins_tag_label'] = record['ins_tag_label']

    _write_jsonl(output_file, by_uid.values())


def merge_cluster_centers(file1, file2, output_file):
    """
    Attach ``cluster_center`` from the second file to records of the first.

    Records are matched by comparing the first file's ``uid`` with the second
    file's ``data_idx``. Unmatched records of the second file are ignored;
    records of the first file are always written.

    :param file1: path of the JSONL file whose records carry ``uid``
    :param file2: path of the JSONL file carrying ``data_idx`` and
        ``cluster_center``
    :param output_file: path of the JSONL file to write
    """
    by_uid = {record['uid']: record for record in _read_jsonl(file1)}

    for record in _read_jsonl(file2):
        key = record.get('data_idx')
        if key in by_uid:
            by_uid[key]['cluster_center'] = record.get('cluster_center')

    _write_jsonl(output_file, by_uid.values())


def merge_scores_and_embeddings(file1, file2, output_file):
    """
    Attach ``score`` and ``embedding`` from the second file to the first.

    Records are matched on ``uid``. ``score`` is taken from the second file's
    ``answer`` field; both added fields are ``None`` when the matching record
    lacks the corresponding key.

    :param file1: path of the JSONL file whose records carry ``uid``
    :param file2: path of the JSONL file carrying ``uid``, ``answer`` and
        ``embedding``
    :param output_file: path of the JSONL file to write
    """
    by_uid = {record['uid']: record for record in _read_jsonl(file1)}

    for record in _read_jsonl(file2):
        key = record.get('uid')
        if key in by_uid:
            by_uid[key]['score'] = record.get('answer')
            by_uid[key]['embedding'] = record.get('embedding')

    _write_jsonl(output_file, by_uid.values())


# The script originally defined ``merge_jsonl_files`` three times; after the
# module finished loading the name referred to the score/embedding merge, so
# that binding is kept for existing callers.
merge_jsonl_files = merge_scores_and_embeddings


def filter_records(input_file_path, output_file_path, target_field='score', target_value=5):
    """
    Copy the records whose target field equals the target value.

    The field value is converted with ``int()`` before the comparison; records
    whose value is missing or cannot be converted are skipped.

    :param input_file_path: path of the input JSONL file
    :param output_file_path: path of the output JSONL file
    :param target_field: name of the field to test, ``'score'`` by default
    :param target_value: value the field must equal, ``5`` by default
    """
    with open(input_file_path, 'r', encoding='utf-8') as infile, \
            open(output_file_path, 'w', encoding='utf-8') as outfile:
        for line in infile:
            line = line.strip()
            if not line:
                continue
            record = json.loads(line)
            # Only the lookup/conversion is best-effort; a failing write must
            # not be swallowed together with it.
            try:
                value = int(record.get(target_field))
            except (AttributeError, TypeError, ValueError, OverflowError):
                continue
            if value == target_value:
                outfile.write(json.dumps(record, ensure_ascii=False) + '\n')


def main():
    """Run the merge / clean / filter steps on the ``./dhbq`` files."""
    # Example usage: merge the two score files and clean their answers.
    merge_and_clean_jsonl_files(*SCORE_MERGE_PATHS)

    # Example usage: join prompt labels with instruction tags.
    file1 = './dhbq/dhbq_prompt.jsonl'
    file2 = './dhbq/dhbq_instag.jsonl'
    output_file = './dhbq/dhbq.jsonl'
    merge_jsonl_files_by_uid(file1, file2, output_file)

    # Example usage: add the k-means cluster centers.
    file1 = './dhbq/dhbq.jsonl'
    file2 = './dhbq/dhbq_cluster_kmeans_result.jsonl'
    output_file = './dhbq/dhbq_merged.jsonl'
    merge_cluster_centers(file1, file2, output_file)

    # Example usage: add scores and embeddings.
    file1 = './dhbq/dhbq_merged.jsonl'
    file2 = './dhbq/dhbq_score.jsonl'
    output_file = './dhbq/dhbq_merged_with_score.jsonl'
    merge_scores_and_embeddings(file1, file2, output_file)

    # Example usage: keep the records scored 5 (replace the paths as needed).
    input_file_path = './dhbq/dhbq_merged_with_score.jsonl'
    output_file_path = './dhbq/dhbq_merged_with_score_5.jsonl'
    filter_records(input_file_path, output_file_path)
    print(f"筛选完成,已将符合条件的记录保存至{output_file_path}")


if __name__ == '__main__':
    main()