offline_data_model_pipline/data_generate/query_completion/merge.py

#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright @2024 AI. Inspur Inc.
#
# @author: suojiayi <suojiayi@inspur.com>
# @date: 2025/05/13
import json


def clean_answer(answer):
    """
    Extract the content wrapped in ```json ... ``` fences and strip the extra
    markers and whitespace. If the wrapped content is valid JSON, return it as
    a compact JSON string; otherwise return the content unchanged.
    """
    # Look for a ```json ... ``` fenced block
    start = answer.find("```json")
    if start != -1:
        # Take the content between ```json and the closing ```
        end = answer.find("```", start + len("```json"))
        if end != -1:
            content = answer[start + len("```json"):end].strip()
        else:
            content = answer.strip()  # no closing ``` found, keep the original content
    else:
        content = answer.strip()  # no ```json marker, keep the original content
    # Try to parse the extracted content as JSON
    try:
        cleaned_json = json.loads(content)
        return json.dumps(cleaned_json, ensure_ascii=False)  # compact JSON string
    except json.JSONDecodeError:
        return content  # not valid JSON, return the content as-is
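
# Illustrative behaviour of clean_answer (hypothetical input, not taken from
# the pipeline's real data):
#   clean_answer('Sure:\n```json\n{ "score" : 5 }\n```')  ->  '{"score": 5}'
#   clean_answer('plain text')                            ->  'plain text'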


def merge_jsonl_files(file1, file2, output_file):
    """
    Merge two JSONL files and clean the 'answer' field of every record.
    """
    merged_data = []
    # Read the first file
    with open(file1, 'r', encoding='utf-8') as f1:
        for line in f1:
            data = json.loads(line.strip())
            if 'answer' in data:
                data['answer'] = clean_answer(data['answer'])
            merged_data.append(data)
    # Read the second file
    with open(file2, 'r', encoding='utf-8') as f2:
        for line in f2:
            data = json.loads(line.strip())
            if 'answer' in data:
                data['answer'] = clean_answer(data['answer'])
            merged_data.append(data)
    # Write the merged records to the output file
    with open(output_file, 'w', encoding='utf-8') as out_f:
        for item in merged_data:
            out_f.write(json.dumps(item, ensure_ascii=False) + '\n')


# Example usage: merge the prompt files first, then the score files
file1 = './dhbq/duihuabuquan_prompt_1.jsonl'
file2 = './dhbq/duihuabuquan_prompt_2.jsonl'
output_file = './dhbq/dhbq_prompt.jsonl'
merge_jsonl_files(file1, file2, output_file)

file1 = './dhbq/duihuabuquan_score_1.jsonl'
file2 = './dhbq/duihuabuquan_score_2.jsonl'
output_file = './dhbq/dhbq_score.jsonl'
merge_jsonl_files(file1, file2, output_file)
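
# Assumed shape of an input line (hypothetical, inferred from the fields the
# code reads): {"uid": "...", "data": "...", "answer": "```json\n{...}\n```"}
# The same record is written out with the fenced block stripped from 'answer'.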


def merge_jsonl_files_by_uid(file1, file2, output_file):
    """
    Merge two JSONL files by UID: keep one 'data' per UID and combine the
    'answer' and 'ins_tag_label' fields. The 'answer' field is renamed to
    'prompt_label', and the 'uid' field is preserved.
    """
    # Read the first file into a dict keyed by uid
    data_dict = {}
    with open(file1, 'r', encoding='utf-8') as f1:
        for line in f1:
            record = json.loads(line.strip())
            uid = record['uid']
            if uid not in data_dict:
                data_dict[uid] = {
                    'uid': uid,  # keep the uid field
                    'data': record['data'],
                    'prompt_label': record.get('answer')
                }
            else:
                print(f"Warning: Duplicate UID found in the first file: {uid}")
    # Process the second file and merge by UID
    with open(file2, 'r', encoding='utf-8') as f2:
        for line in f2:
            record = json.loads(line.strip())
            uid = record['uid']
            if uid in data_dict:
                # UID already known: add or update the ins_tag_label field
                if 'ins_tag_label' in record:
                    data_dict[uid]['ins_tag_label'] = record['ins_tag_label']
            else:
                # UID missing from the first file: add the record directly
                new_record = {
                    'uid': uid,  # keep the uid field
                    'data': record['data']
                }
                if 'ins_tag_label' in record:
                    new_record['ins_tag_label'] = record['ins_tag_label']
                data_dict[uid] = new_record
    # Write the merged records to the output file
    with open(output_file, 'w', encoding='utf-8') as out_f:
        for uid in data_dict:
            out_f.write(json.dumps(data_dict[uid], ensure_ascii=False) + '\n')

# Example usage
file1 = './dhbq/dhbq_prompt.jsonl'
file2 = './dhbq/dhbq_instag.jsonl'
output_file = './dhbq/dhbq.jsonl'
merge_jsonl_files_by_uid(file1, file2, output_file)
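
# A merged line in dhbq.jsonl then carries (hypothetical example):
#   {"uid": "...", "data": "...", "prompt_label": "...", "ins_tag_label": [...]}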


def merge_jsonl_files(file1, file2, output_file):
    """
    Merge two JSONL files:
    - the first file contains 'uid' and the rest of the record;
    - the second file contains 'data_idx', 'cluster_center', and 'embedding';
    - where 'data_idx' matches a 'uid' from the first file, copy
      'cluster_center' and 'embedding' into that record.
    """
    # Read the first file into a dict keyed by 'uid'
    data_dict = {}
    with open(file1, 'r', encoding='utf-8') as f1:
        for line in f1:
            record = json.loads(line.strip())
            uid = record['uid']
            data_dict[uid] = record
    # Read the second file and pull out 'data_idx', 'cluster_center' and 'embedding'
    with open(file2, 'r', encoding='utf-8') as f2:
        for line in f2:
            record = json.loads(line.strip())
            index = record.get('data_idx')
            cluster_center = record.get('cluster_center')
            embedding = record.get('embedding')
            # Merge when the index matches a uid from the first file
            if index in data_dict:
                data_dict[index]['cluster_center'] = cluster_center
                data_dict[index]['embedding'] = embedding
    # Write the merged records to the output file
    with open(output_file, 'w', encoding='utf-8') as out_f:
        for uid in data_dict:
            out_f.write(json.dumps(data_dict[uid], ensure_ascii=False) + '\n')

# Example usage
file1 = './dhbq/dhbq.jsonl'
file2 = './dhbq/dhbq_cluster_kmeans_result.jsonl'
output_file = './dhbq/dhbq_merged.jsonl'
merge_jsonl_files(file1, file2, output_file)
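
# Each line of dhbq_cluster_kmeans_result.jsonl is assumed to look roughly like
# (hypothetical, based on the fields read above):
#   {"data_idx": "...", "cluster_center": [...], "embedding": [...]}
# where 'data_idx' carries the same value as 'uid' in dhbq.jsonl.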


def merge_jsonl_files(file1, file2, output_file):
    """
    Merge two JSONL files:
    - the first file contains 'uid' and the rest of the record;
    - the second file contains 'uid' and an 'answer' holding a quality score;
    - where the UIDs match, copy the score into the first file's record
      under the 'score' key.
    """
    # Read the first file into a dict keyed by 'uid'
    data_dict = {}
    with open(file1, 'r', encoding='utf-8') as f1:
        for line in f1:
            record = json.loads(line.strip())
            uid = record['uid']
            data_dict[uid] = record
    # Read the second file and pull out each uid and its score
    with open(file2, 'r', encoding='utf-8') as f2:
        for line in f2:
            record = json.loads(line.strip())
            index = record.get('uid')
            score = record.get('answer')
            # Merge when the uid exists in the first file
            if index in data_dict:
                data_dict[index]['score'] = score
    # Write the merged records to the output file
    with open(output_file, 'w', encoding='utf-8') as out_f:
        for uid in data_dict:
            out_f.write(json.dumps(data_dict[uid], ensure_ascii=False) + '\n')

# Example usage
file1 = './dhbq/dhbq_merged.jsonl'
file2 = './dhbq/dhbq_score.jsonl'
output_file = './dhbq/dhbq_merged_with_score.jsonl'
merge_jsonl_files(file1, file2, output_file)
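
# dhbq_score.jsonl is assumed to pair each 'uid' with a cleaned 'answer' that
# holds the score, e.g. (hypothetical): {"uid": "...", "answer": "5"}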


def filter_records(input_file_path, output_file_path, target_field='score', target_value=5):
    """
    Keep only the records in a JSONL file whose target field equals a given
    value, and save them to a new JSONL file.
    :param input_file_path: path of the input JSONL file
    :param output_file_path: path of the output JSONL file
    :param target_field: field to check (default 'score')
    :param target_value: value the field must equal (default 5)
    """
    with open(input_file_path, 'r', encoding='utf-8') as infile, \
            open(output_file_path, 'w', encoding='utf-8') as outfile:
        for line in infile:
            record = json.loads(line.strip())
            # Keep the record only if the target field equals the target value
            try:
                if int(record.get(target_field)) == target_value:
                    outfile.write(json.dumps(record, ensure_ascii=False) + '\n')
            except (TypeError, ValueError):
                # Missing or non-numeric field: skip the record
                continue


# Example usage
if __name__ == '__main__':
    input_file_path = './dhbq/dhbq_merged_with_score.jsonl'     # input JSONL path, adjust as needed
    output_file_path = './dhbq/dhbq_merged_with_score_5.jsonl'  # output JSONL path, adjust as needed
    filter_records(input_file_path, output_file_path)
    print(f"Filtering complete; matching records saved to {output_file_path}")