# offline_data_model_pipline/data_generate/query_completion/merge.py
#
# (Web-viewer export residue, preserved as comments so the file stays parseable:)
# 249 lines
# 9.3 KiB
# Python
# Raw Blame History
#
# This file contains ambiguous Unicode characters
#
# This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright @2024 AI. Inspur Inc.
#
# @author: suojiayi <suojiayi@inspur.com>
# @date: 2025/05/13
import json
def clean_answer(answer):
    """Extract content wrapped in ```json ... ``` fences and normalize it.

    If the (unwrapped) content parses as JSON, return it re-serialized as a
    compact JSON string; otherwise return the stripped raw content.

    :param answer: model output, possibly wrapped in Markdown code fences
    :return: compact JSON string, or the stripped original content
    """
    content = answer
    fence = answer.find("```")
    if fence != -1:
        # Prefer an explicit ```json opener; fall back to a bare ``` fence.
        # (The original code sliced from find("```json") even when that
        # returned -1, which corrupted the extracted content.)
        start = answer.find("```json")
        if start != -1:
            body_start = start + len("```json")
        else:
            body_start = fence + len("```")
        end = answer.find("```", body_start)
        if end != -1:
            content = answer[body_start:end]
        # No closing fence: keep the whole answer.
    content = content.strip()
    # Re-serialize valid JSON compactly; pass anything else through untouched.
    try:
        return json.dumps(json.loads(content), ensure_ascii=False)
    except json.JSONDecodeError:
        return content
def merge_jsonl_files(file1, file2, output_file):
    """Concatenate two JSONL files, cleaning each record's 'answer' field.

    Records from *file1* come first, then records from *file2*. Any record
    with an 'answer' key has that value normalized via ``clean_answer``.

    :param file1: path of the first input JSONL file
    :param file2: path of the second input JSONL file
    :param output_file: path of the merged output JSONL file
    """

    def _load(path):
        # Read one JSONL file, normalizing the 'answer' field of each record.
        records = []
        with open(path, 'r', encoding='utf-8') as f:
            for line in f:
                data = json.loads(line.strip())
                if 'answer' in data:
                    data['answer'] = clean_answer(data['answer'])
                records.append(data)
        return records

    # Unused counters from the original (count1/count2) removed.
    merged_data = _load(file1) + _load(file2)
    with open(output_file, 'w', encoding='utf-8') as out_f:
        for item in merged_data:
            out_f.write(json.dumps(item, ensure_ascii=False) + '\n')
# Example usage.
# NOTE(review): the original line here was a bare `示例用法` expression
# (missing its leading '#'), which raises NameError at import time — restored
# as a comment. Also note the first three assignments below are immediately
# overwritten; only the score files are actually merged. Kept as-is for
# backward compatibility, but flagged for cleanup.
file1 = './dhbq/duihuabuquan_prompt_1.jsonl'
file2 = './dhbq/duihuabuquan_prompt_2.jsonl'
output_file = './dhbq/dhbq_prompt.jsonl'
file1 = './dhbq/duihuabuquan_score_1.jsonl'
file2 = './dhbq/duihuabuquan_score_2.jsonl'
output_file = './dhbq/dhbq_score.jsonl'
merge_jsonl_files(file1, file2, output_file)
import json
def merge_jsonl_files_by_uid(file1, file2, output_file):
    """Merge two JSONL files keyed on 'uid'.

    Each record from *file1* contributes its 'data' and its 'answer'
    (stored under the new name 'prompt_label'); records from *file2*
    contribute 'ins_tag_label'. UIDs that appear only in *file2* are kept
    as standalone records. The 'uid' field is preserved throughout.
    """
    merged = {}

    # Pass 1: seed the result set from the first file, one entry per uid.
    with open(file1, 'r', encoding='utf-8') as handle:
        for raw in handle:
            row = json.loads(raw.strip())
            key = row['uid']
            if key in merged:
                print(f"Warning: Duplicate UID found in the first file: {key}")
            else:
                merged[key] = {
                    'uid': key,
                    'data': row['data'],
                    'prompt_label': row.get('answer'),
                }

    # Pass 2: fold in tag labels; unseen uids become fresh records.
    with open(file2, 'r', encoding='utf-8') as handle:
        for raw in handle:
            row = json.loads(raw.strip())
            key = row['uid']
            if key in merged:
                if 'ins_tag_label' in row:
                    merged[key]['ins_tag_label'] = row['ins_tag_label']
            else:
                fresh = {'uid': key, 'data': row['data']}
                if 'ins_tag_label' in row:
                    fresh['ins_tag_label'] = row['ins_tag_label']
                merged[key] = fresh

    # Emit in insertion order (file1 uids first, then file2-only uids).
    with open(output_file, 'w', encoding='utf-8') as sink:
        for item in merged.values():
            sink.write(json.dumps(item, ensure_ascii=False) + '\n')
# Example usage
file1 = './dhbq/dhbq_prompt.jsonl'
file2 = './dhbq/dhbq_instag.jsonl'
output_file = './dhbq/dhbq.jsonl'
merge_jsonl_files_by_uid(file1, file2, output_file)
import json
def merge_jsonl_files(file1, file2, output_file):
    """Attach k-means cluster assignments to records by uid.

    - *file1* holds JSONL records keyed by 'uid'.
    - *file2* holds JSONL records with 'data_idx' (matching a uid from
      *file1*) and 'cluster_center'.
    - Each record in *file1* whose uid matches a 'data_idx' gains a
      'cluster_center' field. (Embedding merging existed once but was
      disabled; the dead commented-out code has been removed.)

    :param file1: path of the base JSONL file
    :param file2: path of the JSONL file with cluster results
    :param output_file: path of the merged output JSONL file
    """
    # Index the base records by uid.
    data_dict = {}
    with open(file1, 'r', encoding='utf-8') as f1:
        for line in f1:
            record = json.loads(line.strip())
            data_dict[record['uid']] = record

    # Fold each cluster assignment into its matching base record.
    with open(file2, 'r', encoding='utf-8') as f2:
        for line in f2:
            record = json.loads(line.strip())
            index = record.get('data_idx')
            if index in data_dict:
                data_dict[index]['cluster_center'] = record.get('cluster_center')

    with open(output_file, 'w', encoding='utf-8') as out_f:
        for item in data_dict.values():
            out_f.write(json.dumps(item, ensure_ascii=False) + '\n')
# Example usage
file1 = './dhbq/dhbq.jsonl'
file2 = './dhbq/dhbq_cluster_kmeans_result.jsonl'
output_file = './dhbq/dhbq_merged.jsonl'
merge_jsonl_files(file1, file2, output_file)
import json
def merge_jsonl_files(file1, file2, output_file):
    """Attach quality scores and embeddings to records by uid.

    (The original docstring was copy-pasted from the cluster-merge variant
    and described the wrong fields; corrected to match the code.)

    - *file1* holds JSONL records keyed by 'uid'.
    - *file2* holds JSONL records with 'uid', an 'answer' (the score) and
      an 'embedding'.
    - Each record in *file1* whose uid also appears in *file2* gains a
      'score' field (from the second file's 'answer') and an 'embedding'
      field.

    :param file1: path of the base JSONL file
    :param file2: path of the JSONL file with scores and embeddings
    :param output_file: path of the merged output JSONL file
    """
    # Index the base records by uid.
    data_dict = {}
    with open(file1, 'r', encoding='utf-8') as f1:
        for line in f1:
            record = json.loads(line.strip())
            data_dict[record['uid']] = record

    # Fold score and embedding into each matching base record.
    with open(file2, 'r', encoding='utf-8') as f2:
        for line in f2:
            record = json.loads(line.strip())
            uid = record.get('uid')
            if uid in data_dict:
                data_dict[uid]['score'] = record.get('answer')
                data_dict[uid]['embedding'] = record.get('embedding')

    with open(output_file, 'w', encoding='utf-8') as out_f:
        for item in data_dict.values():
            out_f.write(json.dumps(item, ensure_ascii=False) + '\n')
# Example usage
file1 = './dhbq/dhbq_merged.jsonl'
file2 = './dhbq/dhbq_score.jsonl'
output_file = './dhbq/dhbq_merged_with_score.jsonl'
merge_jsonl_files(file1, file2, output_file)
import json
def filter_records(input_file_path, output_file_path, target_field='score', target_value=5):
    """Keep only JSONL records whose *target_field* equals *target_value*.

    The field value is coerced with ``int()`` before comparison, so both
    ``5`` and ``"5"`` match ``target_value=5``. Records where the field is
    missing or not numeric are skipped silently (best-effort filtering).

    :param input_file_path: input JSONL file path
    :param output_file_path: output JSONL file path
    :param target_field: field to test, defaults to 'score'
    :param target_value: value the field must equal, defaults to 5
    """
    with open(input_file_path, 'r', encoding='utf-8') as infile, \
            open(output_file_path, 'w', encoding='utf-8') as outfile:
        for line in infile:
            record = json.loads(line.strip())
            # Narrowed from a bare `except Exception` so real programming
            # errors are no longer swallowed: int(None) raises TypeError,
            # int("abc") raises ValueError — those are the expected skips.
            try:
                matches = int(record.get(target_field)) == target_value
            except (TypeError, ValueError):
                continue
            if matches:
                outfile.write(json.dumps(record, ensure_ascii=False) + '\n')
# Example usage
if __name__ == '__main__':
    input_file_path = './dhbq/dhbq_merged_with_score.jsonl'  # input JSONL path; replace with your actual path
    output_file_path = './dhbq/dhbq_merged_with_score_5.jsonl'  # output JSONL path; replace with your actual path
    filter_records(input_file_path, output_file_path)
    print(f"筛选完成,已将符合条件的记录保存至{output_file_path}")