2025-05-13 13:00:51 +08:00
|
|
|
|
#!/usr/bin/env python
|
|
|
|
|
# -*- coding: utf-8 -*-
|
|
|
|
|
#
|
|
|
|
|
# Copyright @2024 AI. Inspur Inc.
|
|
|
|
|
#
|
|
|
|
|
# @author: suojiayi <suojiayi@inspur.com>
|
|
|
|
|
# @date: 2025/05/13
|
|
|
|
|
|
|
|
|
|
import json
|
|
|
|
|
|
|
|
|
|
def clean_answer(answer):
    """Extract the payload of a ```json fenced block and normalise it.

    The text between the first "```json" marker and the next "```" after it
    is used as the candidate payload. If either the opening marker or the
    closing fence is missing, the whole stripped answer is the candidate.

    Args:
        answer: Raw answer text. Non-string values (for example a JSON
            null decoded to None) are returned unchanged.

    Returns:
        The candidate re-serialised with ``json.dumps(..., ensure_ascii=False)``
        when it parses as JSON; otherwise the stripped candidate text.
    """
    if not isinstance(answer, str):
        # Nothing to extract from a non-text value; pass it through.
        return answer

    opening = "```json"
    content = answer.strip()

    start = answer.find(opening)
    # Only slice when the opening marker really exists. Doing arithmetic on
    # find()'s -1 result would cut an arbitrary fragment out of the text.
    if start != -1:
        body_start = start + len(opening)
        end = answer.find("```", body_start)
        if end != -1:
            content = answer[body_start:end].strip()

    try:
        return json.dumps(json.loads(content), ensure_ascii=False)
    except json.JSONDecodeError:
        # Not valid JSON: hand back the extracted text as-is.
        return content
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def merge_jsonl_files(file1, file2, output_file):
    """Concatenate two JSONL files and clean each record's 'answer' field.

    Records from ``file1`` are emitted first, then those from ``file2``,
    each in file order. Any record that has an 'answer' key gets that value
    replaced by ``clean_answer(value)``. All input is read before the output
    file is opened.

    Args:
        file1: Path of the first input JSONL file (UTF-8).
        file2: Path of the second input JSONL file (UTF-8).
        output_file: Path of the JSONL file to write (UTF-8, overwritten).

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    merged_data = []

    for path in (file1, file2):
        with open(path, 'r', encoding='utf-8') as src:
            for line in src:
                line = line.strip()
                if not line:
                    # Blank lines (e.g. a trailing newline) hold no record.
                    continue
                data = json.loads(line)
                if 'answer' in data:
                    data['answer'] = clean_answer(data['answer'])
                merged_data.append(data)

    with open(output_file, 'w', encoding='utf-8') as out_f:
        for item in merged_data:
            out_f.write(json.dumps(item, ensure_ascii=False) + '\n')
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Example usage: build the merged prompt file and the merged score file.
# Both outputs are read again further down in this script.
file1 = './dhbq/duihuabuquan_prompt_1.jsonl'
file2 = './dhbq/duihuabuquan_prompt_2.jsonl'
output_file = './dhbq/dhbq_prompt.jsonl'
merge_jsonl_files(file1, file2, output_file)

file1 = './dhbq/duihuabuquan_score_1.jsonl'
file2 = './dhbq/duihuabuquan_score_2.jsonl'
output_file = './dhbq/dhbq_score.jsonl'
merge_jsonl_files(file1, file2, output_file)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import json
|
|
|
|
|
|
|
|
|
|
def merge_jsonl_files_by_uid(file1, file2, output_file):
    """Merge two JSONL files on 'uid', one output record per uid.

    From ``file1`` each record contributes 'uid', 'data' and its 'answer'
    value stored under the key 'prompt_label' (None when 'answer' is
    absent). Only the first record per uid is kept; later duplicates print
    a warning and are ignored.

    From ``file2`` a record whose uid is already known only contributes its
    'ins_tag_label' (when present). A record with an unseen uid is added
    with 'uid', 'data' and, when present, 'ins_tag_label'; such records
    have no 'prompt_label' key.

    Args:
        file1: Path of the JSONL file carrying 'uid', 'data' and 'answer'.
        file2: Path of the JSONL file carrying 'uid', 'data' and
            optionally 'ins_tag_label'.
        output_file: Path of the JSONL file to write (UTF-8, overwritten).

    Raises:
        KeyError: If a record lacks 'uid', or lacks 'data' when 'data' is
            needed to create a new entry.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    data_dict = {}

    with open(file1, 'r', encoding='utf-8') as f1:
        for line in f1:
            line = line.strip()
            if not line:
                # Blank lines (e.g. a trailing newline) hold no record.
                continue
            record = json.loads(line)
            uid = record['uid']
            if uid in data_dict:
                print(f"Warning: Duplicate UID found in the first file: {uid}")
                continue
            data_dict[uid] = {
                'uid': uid,
                'data': record['data'],
                'prompt_label': record.get('answer'),
            }

    with open(file2, 'r', encoding='utf-8') as f2:
        for line in f2:
            line = line.strip()
            if not line:
                continue
            record = json.loads(line)
            uid = record['uid']
            if uid in data_dict:
                # Known uid: only the tag label is taken from this file.
                if 'ins_tag_label' in record:
                    data_dict[uid]['ins_tag_label'] = record['ins_tag_label']
            else:
                # uid missing from the first file: add it to the result.
                new_record = {'uid': uid, 'data': record['data']}
                if 'ins_tag_label' in record:
                    new_record['ins_tag_label'] = record['ins_tag_label']
                data_dict[uid] = new_record

    with open(output_file, 'w', encoding='utf-8') as out_f:
        for merged in data_dict.values():
            out_f.write(json.dumps(merged, ensure_ascii=False) + '\n')
|
|
|
|
|
|
|
|
|
|
# Example usage: merge the prompt file with the ins-tag file by uid.
file1, file2, output_file = (
    './dhbq/dhbq_prompt.jsonl',
    './dhbq/dhbq_instag.jsonl',
    './dhbq/dhbq.jsonl',
)

merge_jsonl_files_by_uid(file1, file2, output_file)
|
|
|
|
|
|
|
|
|
|
import json
|
|
|
|
|
|
|
|
|
|
def merge_jsonl_files(file1, file2, output_file):
    """Attach 'cluster_center' and 'embedding' from file2 to file1 records.

    Records of ``file1`` are indexed by their 'uid' (a later record with the
    same uid replaces an earlier one). For each record of ``file2`` whose
    'data_idx' equals a known uid, its 'cluster_center' and 'embedding'
    values (None when absent) are stored on the matching record. Records of
    ``file2`` without a match are ignored.

    Args:
        file1: Path of the JSONL file whose records carry 'uid'.
        file2: Path of the JSONL file carrying 'data_idx',
            'cluster_center' and 'embedding'.
        output_file: Path of the JSONL file to write (UTF-8, overwritten).

    Raises:
        KeyError: If a record of ``file1`` lacks 'uid'.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    data_dict = {}
    with open(file1, 'r', encoding='utf-8') as f1:
        for line in f1:
            line = line.strip()
            if not line:
                # Blank lines (e.g. a trailing newline) hold no record.
                continue
            record = json.loads(line)
            data_dict[record['uid']] = record

    with open(file2, 'r', encoding='utf-8') as f2:
        for line in f2:
            line = line.strip()
            if not line:
                continue
            record = json.loads(line)
            data_idx = record.get('data_idx')
            # Only enrich records present in the first file.
            if data_idx in data_dict:
                target = data_dict[data_idx]
                target['cluster_center'] = record.get('cluster_center')
                target['embedding'] = record.get('embedding')

    with open(output_file, 'w', encoding='utf-8') as out_f:
        for merged in data_dict.values():
            out_f.write(json.dumps(merged, ensure_ascii=False) + '\n')
|
|
|
|
|
|
|
|
|
|
# Example usage: add the k-means clustering result to the uid-merged file.
file1, file2, output_file = (
    './dhbq/dhbq.jsonl',
    './dhbq/dhbq_cluster_kmeans_result.jsonl',
    './dhbq/dhbq_merged.jsonl',
)

merge_jsonl_files(file1, file2, output_file)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import json
|
|
|
|
|
|
|
|
|
|
def merge_jsonl_files(file1, file2, output_file):
    """Attach the 'answer' of file2 records to file1 records as 'score'.

    Records of ``file1`` are indexed by their 'uid' (a later record with the
    same uid replaces an earlier one). For each record of ``file2`` whose
    'uid' is known, its 'answer' value (None when absent) is stored on the
    matching record under the key 'score'. Records of ``file2`` without a
    match are ignored.

    Args:
        file1: Path of the JSONL file whose records carry 'uid'.
        file2: Path of the JSONL file carrying 'uid' and 'answer'.
        output_file: Path of the JSONL file to write (UTF-8, overwritten).

    Raises:
        KeyError: If a record of ``file1`` lacks 'uid'.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    data_dict = {}
    with open(file1, 'r', encoding='utf-8') as f1:
        for line in f1:
            line = line.strip()
            if not line:
                # Blank lines (e.g. a trailing newline) hold no record.
                continue
            record = json.loads(line)
            data_dict[record['uid']] = record

    with open(file2, 'r', encoding='utf-8') as f2:
        for line in f2:
            line = line.strip()
            if not line:
                continue
            record = json.loads(line)
            uid = record.get('uid')
            # Only records present in the first file receive a score.
            if uid in data_dict:
                data_dict[uid]['score'] = record.get('answer')

    with open(output_file, 'w', encoding='utf-8') as out_f:
        for merged in data_dict.values():
            out_f.write(json.dumps(merged, ensure_ascii=False) + '\n')
|
|
|
|
|
|
|
|
|
|
# Example usage: add the scores to the cluster-merged file.
file1, file2, output_file = (
    './dhbq/dhbq_merged.jsonl',
    './dhbq/dhbq_score.jsonl',
    './dhbq/dhbq_merged_with_score.jsonl',
)

merge_jsonl_files(file1, file2, output_file)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import json
|
|
|
|
|
|
|
|
|
|
def filter_records(input_file_path, output_file_path, target_field='score', target_value=5):
    """Copy the records whose target field, as an int, equals target_value.

    Each record's ``target_field`` value is converted with ``int()`` before
    the comparison, so the string "5" matches 5 and a float is truncated
    first. Records where the field is missing or cannot be converted, and
    lines that decode to something other than a JSON object, are skipped.

    Args:
        input_file_path: Path of the input JSONL file (UTF-8).
        output_file_path: Path of the output JSONL file (UTF-8, overwritten).
        target_field: Name of the field to test. Defaults to 'score'.
        target_value: Value the converted field must equal. Defaults to 5.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(input_file_path, 'r', encoding='utf-8') as infile, \
            open(output_file_path, 'w', encoding='utf-8') as outfile:
        for line in infile:
            line = line.strip()
            if not line:
                # Blank lines (e.g. a trailing newline) hold no record.
                continue
            record = json.loads(line)
            if not isinstance(record, dict):
                continue
            # Guard only the conversion, so that write failures are not
            # silently swallowed along with unusable field values.
            try:
                matches = int(record.get(target_field)) == target_value
            except (TypeError, ValueError, OverflowError):
                continue
            if matches:
                outfile.write(json.dumps(record, ensure_ascii=False) + '\n')
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Example usage
if __name__ == '__main__':
    # Example paths; replace both with your actual input/output locations.
    input_file_path, output_file_path = (
        './dhbq/dhbq_merged_with_score.jsonl',
        './dhbq/dhbq_merged_with_score_5.jsonl',
    )

    filter_records(input_file_path, output_file_path)
    print(f"筛选完成,已将符合条件的记录保存至{output_file_path}")
|