249 lines
9.3 KiB
Python
249 lines
9.3 KiB
Python
#!/usr/bin/env python
|
||
# -*- coding: utf-8 -*-
|
||
#
|
||
# Copyright @2024 AI. Inspur Inc.
|
||
#
|
||
# @author: suojiayi <suojiayi@inspur.com>
|
||
# @date: 2025/05/13
|
||
|
||
import json
|
||
|
||
def clean_answer(answer):
    """Extract the payload of a ```json ... ``` fenced block and normalize it.

    The text between the first ```json marker and the next ``` after it is
    taken as the candidate content. If there is no ```json marker, or the
    fence is never closed, the whole stripped input is used instead.

    Args:
        answer: Raw answer text. Non-string values are returned unchanged.

    Returns:
        The candidate content re-serialized with ``json.dumps`` (non-ASCII
        characters kept as-is) when it parses as JSON; otherwise the stripped
        candidate content itself.
    """
    if not isinstance(answer, str):
        # Nothing to clean on values that are not text (None, dict, ...).
        return answer

    opening = "```json"
    content = answer.strip()
    start = answer.find(opening)
    # Slice only when the ```json marker is really present: find() returns -1
    # otherwise, and offsetting from -1 would cut the text at an arbitrary
    # position and return a garbled fragment.
    if start != -1:
        body_start = start + len(opening)
        end = answer.find("```", body_start)
        if end != -1:
            content = answer[body_start:end].strip()

    try:
        parsed = json.loads(content)
    except json.JSONDecodeError:
        # Not valid JSON: hand back the extracted text as-is.
        return content
    return json.dumps(parsed, ensure_ascii=False)
|
||
|
||
|
||
def merge_jsonl_files(file1, file2, output_file):
    """Concatenate two JSONL files into one, cleaning each record's 'answer'.

    Records are read from ``file1`` and then ``file2``, in order. When a
    record has an 'answer' key, its value is replaced by
    ``clean_answer(value)``. Both inputs are read completely before
    ``output_file`` is opened for writing.

    Args:
        file1: Path of the first input JSONL file (read as UTF-8).
        file2: Path of the second input JSONL file (read as UTF-8).
        output_file: Path of the JSONL file to write; opened in 'w' mode.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    merged_data = []
    for path in (file1, file2):
        with open(path, 'r', encoding='utf-8') as src:
            for line in src:
                line = line.strip()
                if not line:
                    # Blank lines carry no record; json.loads('') would raise.
                    continue
                data = json.loads(line)
                if 'answer' in data:
                    data['answer'] = clean_answer(data['answer'])
                merged_data.append(data)

    with open(output_file, 'w', encoding='utf-8') as out_f:
        for item in merged_data:
            out_f.write(json.dumps(item, ensure_ascii=False) + '\n')
|
||
|
||
|
||
# Example usage
# NOTE(review): the prompt paths below are overwritten by the score paths
# before the call, so only the score files are merged by this run.
file1 = './dhbq/duihuabuquan_prompt_1.jsonl'
file2 = './dhbq/duihuabuquan_prompt_2.jsonl'
output_file = './dhbq/dhbq_prompt.jsonl'
file1 = './dhbq/duihuabuquan_score_1.jsonl'
file2 = './dhbq/duihuabuquan_score_2.jsonl'
output_file = './dhbq/dhbq_score.jsonl'

merge_jsonl_files(file1, file2, output_file)
|
||
|
||
|
||
import json
|
||
|
||
def merge_jsonl_files_by_uid(file1, file2, output_file):
    """Join two JSONL files on 'uid', writing one record per uid.

    From ``file1`` each uid keeps 'uid', 'data' and the record's 'answer'
    stored under 'prompt_label' (``None`` when 'answer' is absent). Only the
    first occurrence of a uid in ``file1`` is kept; later ones print a
    warning. From ``file2`` the 'ins_tag_label' value, when present, is added
    to the record with the same uid. A uid seen only in ``file2`` produces a
    record with 'uid', 'data' and, when present, 'ins_tag_label'.

    Args:
        file1: Path of the JSONL file providing 'uid', 'data' and 'answer'.
        file2: Path of the JSONL file providing 'uid' and 'ins_tag_label'.
        output_file: Path of the JSONL file to write; opened in 'w' mode.

    Raises:
        KeyError: If a record lacks 'uid', or lacks 'data' where it is needed.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    data_dict = {}

    # First file: seed one entry per uid.
    with open(file1, 'r', encoding='utf-8') as f1:
        for line in f1:
            line = line.strip()
            if not line:
                # Blank lines carry no record; json.loads('') would raise.
                continue
            record = json.loads(line)
            uid = record['uid']
            if uid in data_dict:
                print(f"Warning: Duplicate UID found in the first file: {uid}")
                continue
            data_dict[uid] = {
                'uid': uid,
                'data': record['data'],
                'prompt_label': record.get('answer'),
            }

    # Second file: attach 'ins_tag_label', creating entries for unseen uids.
    with open(file2, 'r', encoding='utf-8') as f2:
        for line in f2:
            line = line.strip()
            if not line:
                continue
            record = json.loads(line)
            uid = record['uid']
            if uid not in data_dict:
                data_dict[uid] = {'uid': uid, 'data': record['data']}
            if 'ins_tag_label' in record:
                data_dict[uid]['ins_tag_label'] = record['ins_tag_label']

    with open(output_file, 'w', encoding='utf-8') as out_f:
        for merged in data_dict.values():
            out_f.write(json.dumps(merged, ensure_ascii=False) + '\n')
|
||
|
||
# Example usage: join the prompt labels with the instruction-tag labels.
file1, file2, output_file = (
    './dhbq/dhbq_prompt.jsonl',
    './dhbq/dhbq_instag.jsonl',
    './dhbq/dhbq.jsonl',
)

merge_jsonl_files_by_uid(file1, file2, output_file)
|
||
|
||
import json
|
||
|
||
def merge_jsonl_files(file1, file2, output_file):
    """Copy 'cluster_center' from a second JSONL file onto records keyed by uid.

    Every record of ``file1`` is indexed by its 'uid' (a later record with
    the same uid replaces an earlier one). For each record of ``file2`` whose
    'data_idx' equals one of those uids, the matching record gets a
    'cluster_center' key holding the second record's value (``None`` when
    that key is absent). All ``file1`` records are then written out, matched
    or not.

    Matching uses dict-key equality, so an integer 'data_idx' does not match
    a string 'uid'.

    Args:
        file1: Path of the JSONL file whose records carry 'uid'.
        file2: Path of the JSONL file carrying 'data_idx' and 'cluster_center'.
        output_file: Path of the JSONL file to write; opened in 'w' mode.

    Raises:
        KeyError: If a record of ``file1`` has no 'uid'.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    data_dict = {}
    with open(file1, 'r', encoding='utf-8') as f1:
        for line in f1:
            line = line.strip()
            if not line:
                # Blank lines carry no record; json.loads('') would raise.
                continue
            record = json.loads(line)
            data_dict[record['uid']] = record

    with open(file2, 'r', encoding='utf-8') as f2:
        for line in f2:
            line = line.strip()
            if not line:
                continue
            record = json.loads(line)
            index = record.get('data_idx')
            if index in data_dict:
                data_dict[index]['cluster_center'] = record.get('cluster_center')

    with open(output_file, 'w', encoding='utf-8') as out_f:
        for merged in data_dict.values():
            out_f.write(json.dumps(merged, ensure_ascii=False) + '\n')
|
||
|
||
# Example usage: add the k-means cluster centers to the merged records.
file1, file2, output_file = (
    './dhbq/dhbq.jsonl',
    './dhbq/dhbq_cluster_kmeans_result.jsonl',
    './dhbq/dhbq_merged.jsonl',
)

merge_jsonl_files(file1, file2, output_file)
|
||
|
||
|
||
import json
|
||
|
||
def merge_jsonl_files(file1, file2, output_file):
    """Copy 'answer' (as 'score') and 'embedding' onto records with the same uid.

    Every record of ``file1`` is indexed by its 'uid' (a later record with
    the same uid replaces an earlier one). For each record of ``file2`` whose
    'uid' is among them, the matching record gets 'score' set to the second
    record's 'answer' and 'embedding' set to its 'embedding'. A key missing
    from the ``file2`` record is stored as ``None``, which also overwrites
    any value the ``file1`` record already had under that name. All ``file1``
    records are then written out, matched or not.

    Args:
        file1: Path of the JSONL file whose records carry 'uid'.
        file2: Path of the JSONL file carrying 'uid', 'answer' and 'embedding'.
        output_file: Path of the JSONL file to write; opened in 'w' mode.

    Raises:
        KeyError: If a record of ``file1`` has no 'uid'.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    data_dict = {}
    with open(file1, 'r', encoding='utf-8') as f1:
        for line in f1:
            line = line.strip()
            if not line:
                # Blank lines carry no record; json.loads('') would raise.
                continue
            record = json.loads(line)
            data_dict[record['uid']] = record

    with open(file2, 'r', encoding='utf-8') as f2:
        for line in f2:
            line = line.strip()
            if not line:
                continue
            record = json.loads(line)
            index = record.get('uid')
            if index in data_dict:
                target = data_dict[index]
                target['score'] = record.get('answer')
                target['embedding'] = record.get('embedding')

    with open(output_file, 'w', encoding='utf-8') as out_f:
        for merged in data_dict.values():
            out_f.write(json.dumps(merged, ensure_ascii=False) + '\n')
|
||
|
||
# Example usage: add the scores to the clustered records.
file1, file2, output_file = (
    './dhbq/dhbq_merged.jsonl',
    './dhbq/dhbq_score.jsonl',
    './dhbq/dhbq_merged_with_score.jsonl',
)

merge_jsonl_files(file1, file2, output_file)
|
||
|
||
|
||
import json
|
||
|
||
def filter_records(input_file_path, output_file_path, target_field='score', target_value=5):
    """
    Copy the records of a JSONL file whose target field equals a given value.

    The field value is converted with ``int()`` before the comparison, so
    ``5``, ``"5"`` and ``5.7`` all match a target value of 5. Records whose
    field is missing or cannot be converted are skipped, as are blank lines
    and lines whose JSON value is not an object.

    :param input_file_path: path of the input JSONL file
    :param output_file_path: path of the output JSONL file (opened in 'w' mode)
    :param target_field: name of the field to test, 'score' by default
    :param target_value: value the converted field must equal, 5 by default
    :raises json.JSONDecodeError: if a non-blank line is not valid JSON
    """
    with open(input_file_path, 'r', encoding='utf-8') as infile, \
            open(output_file_path, 'w', encoding='utf-8') as outfile:
        for line in infile:
            line = line.strip()
            if not line:
                # Blank lines carry no record; json.loads('') would raise.
                continue
            record = json.loads(line)
            if not isinstance(record, dict):
                # Only JSON objects have fields to test.
                continue
            try:
                matches = int(record.get(target_field)) == target_value
            except (TypeError, ValueError, OverflowError):
                # Missing (None), non-numeric or non-finite value: no match.
                # Only the conversion is guarded, so write errors still surface.
                continue
            if matches:
                outfile.write(json.dumps(record, ensure_ascii=False) + '\n')
|
||
|
||
|
||
# Usage example
if __name__ == '__main__':
    # Input and output JSONL paths; replace them with your actual paths.
    input_file_path, output_file_path = (
        './dhbq/dhbq_merged_with_score.jsonl',
        './dhbq/dhbq_merged_with_score_5.jsonl',
    )

    filter_records(input_file_path, output_file_path)
    print(f"筛选完成,已将符合条件的记录保存至{output_file_path}")
|