feat: generate dialogue-completion training data

This commit is contained in:
suojiayi 2025-05-13 05:00:51 +00:00
parent 4af0f9c4a1
commit ee5f4585c9
14 changed files with 1749 additions and 0 deletions

View File

@ -0,0 +1,26 @@
# Data Processing Pipeline
## Data Scoring
- **Score**: score each sample and produce a dataset annotated with scores.
## Data Labeling
- **Instag**: generate `dhbq_instag.jsonl`.
- **Prompt Label**: generate `prompt_label` annotations.
- **Export Embedding + Cluster KMeans**: cluster the exported embeddings with KMeans to obtain `dhbq_cluster_kmeans_result.jsonl`.
## Data Merging
- **Merge**: merge the scores and the three kinds of labels (`instag`, `prompt_label`, `cluster`) into `dhbq_merged_with_score.jsonl`.
- Extract the samples whose score equals 5 to produce `dhbq_merged_with_score_5.jsonl`.
## High-Frequency Question Handling
- **Frequency**: count how often each question occurs in the raw file and identify the questions whose score equals 5.
- **Embedding Similarity**: filter duplicate questions by embedding similarity; questions that cannot be filtered this way are reviewed manually.
## Category Statistics
- **Count Instag**: count the number of `instag` labels.
- **Count Label**: count the number of `prompt_label` labels.
- **Count Cluster**: count the sizes of the KMeans clusters.
## Data Selection
- **Select Final Data**: sample the required data from the final dataset by category.
- **Gen Train Data**: convert the selected data into the format used for model training (see the execution sketch below).
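
The steps above map onto the scripts in this commit. A minimal orchestration sketch is shown below; the script names are assumptions used only for illustration, since the diff does not show the actual file names.

```python
import subprocess

# Assumed script names, in the order implied by the pipeline description above.
steps = [
    "score.py",                 # Score: annotate every sample with a 1-5 score
    "instag.py",                # Instag: produce dhbq_instag.jsonl
    "prompt_label.py",          # Prompt Label: label dialogues via an LLM prompt
    "export_embedding.py",      # Export Embedding: save dialogue embeddings
    "cluster_kmeans.py",        # Cluster KMeans: dhbq_cluster_kmeans_result.jsonl
    "merge.py",                 # Merge: dhbq_merged_with_score.jsonl (+ score==5 subset)
    "frequency.py",             # Frequency: high-frequency questions with score 5
    "embedding_similarity.py",  # Embedding Similarity: semantic de-duplication
    "count_instag.py",          # Count Instag
    "count_label.py",           # Count Label
    "count_cluster.py",         # Count Cluster
    "select_final_data.py",     # Select Final Data
    "gen_train_data.py",        # Gen Train Data
]

for script in steps:
    subprocess.run(["python", script], check=True)
```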

View File

@ -0,0 +1,100 @@
# -*- coding: utf-8 -*-
import json
import sys
import os
import torch
import numpy as np
from sklearnex import patch_sklearn, unpatch_sklearn
from matplotlib import pyplot as plt
import time
import pandas as pd
def train_kmeans_model(train_data, n_clusters, data_index_list):
# Note: patch_sklearn() must be called before importing the sklearn tools.
patch_sklearn()
# unpatch_sklearn()
from sklearn.cluster import KMeans
start_time = time.time()
# the training data must be provided as an np.ndarray
data_set = train_data
print("train_data_len:" + str(len(data_set)))
print("n_clusters:" + str(n_clusters))
# init defaults to k-means++
model = KMeans(n_clusters=n_clusters, init='k-means++', max_iter=300, n_init="auto", random_state=0)
model.fit(data_set)
# the fitted KMeans model already contains the cluster centers
center = model.cluster_centers_
print(center)
cluster_label = model.predict(data_set)
print("cluster_label_size:" + str(len(cluster_label)))
print("type:" + str(type(cluster_label)))
train_dfs = pd.DataFrame(data_set)
train_dfs["predict_label"] = cluster_label
train_dfs["data_index"] = data_index_list
print(train_dfs.columns.values)
end_time = time.time()
avg_time_cost = (end_time - start_time) * 1.0
print("train_kmeans_time:" + str(avg_time_cost) + " s")
return train_dfs
# step1: load the saved embedding binary file
embed_file = "./dhbq/dhbq_embedding.pth"
embed_dict_list = torch.load(embed_file)
print("len_embed_dict_list:", len(embed_dict_list))
# step2: parse the data
raw_embeding_list = []
raw_index_list = []
for cur_dict in embed_dict_list:
# each saved element is loaded back directly as a dict object
cur_embeding = cur_dict["embedding"]
cur_data_idx = cur_dict["data_index"]
raw_embeding_list.append(cur_embeding)
raw_index_list.append(cur_data_idx)
train_array = np.array(raw_embeding_list)
print("train_array_shape:", train_array.shape)
# original plan: 1000 clusters in total (50 coarse clusters with ~20 sub-clusters each);
# here we simply run a single flat clustering and revisit the choice if the result is poor
num_cluster = 300
# pandas aligns rows by index automatically here
train_dfs = train_kmeans_model(train_array, num_cluster, raw_index_list)
predict_label = train_dfs['predict_label'].tolist()
print("len_predict_label:", len(predict_label))
# Do not save to CSV here: pandas CSV export can behave unexpectedly on special characters.
#data_to_save = {'embeding': raw_embeding_list, 'cluster_center': predict_label, "data_idx": raw_index_list}
data_to_save = [
{
"embedding": raw_embeding_list[i],
"cluster_center": predict_label[i],
"data_idx": raw_index_list[i]
}
for i in range(len(raw_embeding_list))
]
output_file = "./dhbq/dhbq_cluster_kmeans_result.jsonl"
with open(output_file, 'w', encoding='utf-8') as f:
for record in data_to_save:
f.write(json.dumps(record, ensure_ascii=False) + '\n')
print(f"Results saved to {output_file}")
data_to_save_df = pd.DataFrame(data_to_save)
data_to_save_df.to_pickle("./dhbq/dhbq_cluster_kmeans_result.pkl")
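# Optional sanity check (an added sketch, not part of the original script): reload the
# saved pickle and inspect how many samples each cluster received.
check_df = pd.read_pickle("./dhbq/dhbq_cluster_kmeans_result.pkl")
print(check_df["cluster_center"].value_counts().head(20))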

View File

@ -0,0 +1,71 @@
import json
from collections import defaultdict
import pandas as pd
def classify_data_by_cluster_center(input_file, output_jsonl, output_excel):
"""
根据 cluster_center 对数据进行分类
每个 cluster_center 对应一个列表存储去掉了 embedding 的数据
输出 JSONL 文件按类型的数据量从多到少排序并记录每种类型的数量
同时将类型及其数量导出到 Excel 表格中
每个 cluster 对应的 data_list 会根据 'score' 字段降序排序
"""
# initialize the classification dict
classified_data = defaultdict(list)
# read the input file
with open(input_file, 'r', encoding='utf-8') as f:
for line in f:
record = json.loads(line.strip())
# extract the cluster_center field
cluster_center = record.get("cluster_center")
# if cluster_center is present, classify by its value
if cluster_center is not None:
record_without_embedding = {k: v for k, v in record.items() if k != "embedding"}
classified_data[cluster_center].append(record_without_embedding)
else:
# if there is no cluster_center, classify the record under "null"
record_without_embedding = {k: v for k, v in record.items() if k != "embedding"}
classified_data["null"].append(record_without_embedding)
# sort each cluster's data_list by score (default 0 when missing), descending
for center in classified_data:
classified_data[center].sort(key=lambda x: x.get('score', 0), reverse=True)
# sort clusters by size, descending
sorted_classified_data = sorted(classified_data.items(), key=lambda x: len(x[1]), reverse=True)
# write the JSONL file
total_types = len(sorted_classified_data)
with open(output_jsonl, 'w', encoding='utf-8') as out_f:
for cluster_center, data_list in sorted_classified_data:
entry = {
str(cluster_center): data_list,
#"count": len(data_list)
}
out_f.write(json.dumps(entry, ensure_ascii=False) + '\n')
# prepare the Excel data
excel_data = []
for cluster_center, data_list in sorted_classified_data:
excel_data.append({"Cluster Center": cluster_center, "Count": len(data_list)})
# export to an Excel file
df = pd.DataFrame(excel_data)
df.to_excel(output_excel, index=False)
print(f"Total types: {total_types}")
return total_types
# Example usage
if __name__ == "__main__":
input_file = './dhbq/dhbq_merged_with_score_0513.jsonl'
output_jsonl = './dhbq/dhbq_count_cluster_0513.jsonl'
output_excel = './dhbq/dhbq_count_cluster_0513.xlsx'
total_types = classify_data_by_cluster_center(input_file, output_jsonl, output_excel)
print(f"Total types found: {total_types}")

View File

@ -0,0 +1,89 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright @2024 AI. Inspur Inc.
#
# @author: suojiayi <suojiayi@inspur.com>
# @date: 2025/05/13
import json
from collections import defaultdict
import pandas as pd
def classify_data_by_ins_tag(input_file, output_jsonl, output_excel):
"""
根据 ins_tag_label 中的标签对数据进行分类
每个类型对应一个列表存储去掉了 embedding 的数据
输出 JSONL 文件按类型的数据量从多到少排序并记录每种类型的数量
同时将类型及其数量导出到 Excel 表格中
"""
# initialize the classification dict
classified_data = defaultdict(list)
# read the input file
with open(input_file, 'r', encoding='utf-8') as f:
for line in f:
record = json.loads(line.strip())
# extract the ins_tag_label field
ins_tag_label = record.get("ins_tag_label", "[]")
try:
ins_tag_label = json.loads(ins_tag_label)
except json.JSONDecodeError:
#print(f"JSON decode error: {ins_tag_label}")
ins_tag_label = []
# if ins_tag_label is present, classify the record under each of its tags
if ins_tag_label:
for tag in ins_tag_label:
record_without_embedding = {k: v for k, v in record.items() if k != "embedding"}
classified_data[tag].append(record_without_embedding)
else:
# if there is no ins_tag_label, classify the record under "null"
record_without_embedding = {k: v for k, v in record.items() if k != "embedding"}
classified_data["null"].append(record_without_embedding)
# sort each data_list by score (default 0 when score is missing), descending
for tag in classified_data:
classified_data[tag].sort(key=lambda x: x.get('score', 0), reverse=True)
# drop tags with fewer than 83 records and exclude the "null" tag
filtered_data = {
tag: records for tag, records in classified_data.items()
if len(records) >= 83 and tag != "null"
}
# sort by record count, descending
sorted_filtered_data = sorted(filtered_data.items(), key=lambda x: len(x[1]), reverse=True)
# write the JSONL file
total_types = len(sorted_filtered_data)
with open(output_jsonl, 'w', encoding='utf-8') as out_f:
for type_name, data_list in sorted_filtered_data:
entry = {
type_name: data_list,
#"count": len(data_list)
}
out_f.write(json.dumps(entry, ensure_ascii=False) + '\n')
# prepare the Excel data
excel_data = []
for type_name, data_list in sorted_filtered_data:
excel_data.append({"Type": type_name, "Count": len(data_list)})
# export to an Excel file
df = pd.DataFrame(excel_data)
df.to_excel(output_excel, index=False)
print(f"Total types after filtering: {total_types}")
return total_types
# Example usage
input_file = './dhbq/dhbq_merged_with_score.jsonl'
output_jsonl = './dhbq/dhbq_count_instag.jsonl'
output_excel = './dhbq/dhbq_count_instag.xlsx'
total_types = classify_data_by_ins_tag(input_file, output_jsonl, output_excel)
print(f"Final types found: {total_types}")

View File

@ -0,0 +1,97 @@
import json
from collections import defaultdict
import pandas as pd
def classify_data_by_labels(input_file, output_file, output_excel):
"""
根据 prompt_label 中的 labels 提取类型::后面的内容
每个类型对应一个列表存储去掉了 embedding 的数据
输出文件按类型的数据量从多到少排序并记录每种类型的数量
每个类型的 data_list 会根据 score 字段降序排序
并且会过滤掉 data_list 长度小于 19 的类型和特定的 label
"""
# initialize the classification dict
classified_data = defaultdict(list)
excluded_labels = {"null", "", "无指代", "无指代消解", "无明显指代消解",
"无上下文依赖", "无明显指代消解需求", "无明确指代",
"无明显上下文依赖", "无依赖", "无上下文", "无明显指代",
}
# read the input file
with open(input_file, 'r', encoding='utf-8') as f:
for line in f:
record = json.loads(line.strip())
# extract the labels field from prompt_label
prompt_label = record.get("prompt_label", "{}")
try:
prompt_label = json.loads(prompt_label)
except json.JSONDecodeError:
prompt_label = {}
if isinstance(prompt_label, list):
labels = prompt_label[0].get("labels", [])
else:
labels = prompt_label.get("labels", [])
# if labels are present, classify by type
if labels:
for label in labels:
if "::" in label:
type_name = label.split("::")[-1]  # take the part after "::"
# skip excluded label values
if any(excluded_label in label for excluded_label in excluded_labels):
continue
record_without_embedding = {k: v for k, v in record.items() if k != "embedding"}
classified_data[type_name].append(record_without_embedding)
# else:
# # if there are no labels, classify the record under "null"
# record_without_embedding = {k: v for k, v in record.items() if k != "embedding"}
# classified_data["null"].append(record_without_embedding)
# sort each type's data_list by score, descending
for type_name in classified_data.keys():
classified_data[type_name].sort(key=lambda x: x.get('score', 0), reverse=True)
# drop types whose data_list has fewer than 19 records
filtered_classified_data = {k: v for k, v in classified_data.items() if len(v) >= 19}
# sort types by size, descending
sorted_classified_data = sorted(filtered_classified_data.items(), key=lambda x: len(x[1]), reverse=True)
# write the output file
total_types = len(sorted_classified_data)
with open(output_file, 'w', encoding='utf-8') as out_f:
for type_name, data_list in sorted_classified_data:
entry = {
type_name: data_list,
#"count": len(data_list)
}
out_f.write(json.dumps(entry, ensure_ascii=False) + '\n')
print(f"Total types after filtering: {total_types}")
# prepare the Excel data
excel_data = []
for type_name, data_list in sorted_classified_data:
excel_data.append({"Type": type_name, "Count": len(data_list)})
# export to an Excel file
df = pd.DataFrame(excel_data)
df.to_excel(output_excel, index=False)
# Optionally, save the records of type "null" to a separate JSONL file:
# null_data = classified_data.get("null", [])
# if len(null_data) >= 19:  # only save when the "null" type has at least 19 records
# with open('./dhbq/prompt_null.jsonl', 'w', encoding='utf-8') as null_f:
# for record in null_data:
# null_f.write(json.dumps(record, ensure_ascii=False) + '\n')
return total_types
# Example usage
input_file = './dhbq/dhbq_merged_with_score.jsonl'
output_file = './dhbq/dhbq_count_prompt_label.jsonl'
output_excel = './dhbq/dhbq_count_prompt_label.xlsx'
total_types = classify_data_by_labels(input_file, output_file, output_excel)
print(f"Total types found: {total_types}")

View File

@ -0,0 +1,136 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright @2024 AI. Inspur Inc.
#
# @author: suojiayi <suojiayi@inspur.com>
# @date: 2025/05/13
import torch
import pandas as pd
from torch.nn.functional import normalize
import faiss
import numpy as np
from transformers import AutoTokenizer, AutoModel
from tqdm import tqdm
from loguru import logger
# set random seeds
def set_seed(seed=42):
import random
import numpy as np
import torch
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
if torch.cuda.is_available():
torch.cuda.manual_seed_all(seed)
# semantic text de-duplication class
class SemanticDeduplicator:
def __init__(self, model_name_or_path: str, device: str = "cuda"):
self.device = torch.device(device if torch.cuda.is_available() else "cpu")
self.tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
self.model = AutoModel.from_pretrained(model_name_or_path).to(self.device)
self.model.eval()
self.dimension = self.model.config.hidden_size
self.index = faiss.IndexFlatIP(self.dimension)  # inner product equals cosine similarity on normalized vectors
self.seen_embeddings = []
@torch.no_grad()
def get_embedding(self, texts):
inputs = self.tokenizer(texts, padding=True, truncation=True, max_length=512, return_tensors="pt").to(self.device)
outputs = self.model(**inputs)
embeddings = outputs.last_hidden_state[:, 0, :]  # use the [CLS] vector as the sentence representation
embeddings = normalize(embeddings, p=2, dim=1).cpu().numpy()
return embeddings
def deduplicate(self, texts, threshold=0.85):
result = []
for text in tqdm(texts, desc="De-duplicating"):
if not text.strip():
continue
emb = self.get_embedding([text])
if len(self.seen_embeddings) > 0:
self.index.add(np.array(self.seen_embeddings))
D, I = self.index.search(emb, k=1)
if '6+6' in texts:
print(D[0][0])
if '4+4' in texts:
print(D[0][0])
if D[0][0] < threshold:
result.append(text)
self.seen_embeddings.append(emb[0])
self.index.reset()  # the index cache must be cleared after every search
else:
result.append(text)
self.seen_embeddings.append(emb[0])
return result
def main():
# configuration
input_path = './dhbq/frequency-score-5.xlsx'  # input Excel file path
output_path = './dhbq/frequency-score-5-deduplicated.xlsx'  # output Excel file path
model_name_or_path = "/model-pvc/suojiayi/bge-base-zh-v1.5/"  # pretrained model to use
col_index = 0  # index of the column to de-duplicate
threshold = 0.65  # similarity threshold
logger.info("Loading data...")
df = pd.read_excel(input_path).fillna("")
texts = df.iloc[:, col_index].astype(str).str.strip().tolist()
logger.info(f"开始语义去重(模型: {model_name_or_path} | 相似度阈值: {threshold}")
deduplicator = SemanticDeduplicator(model_name_or_path)
unique_texts = deduplicator.deduplicate(texts=texts, threshold=threshold)
logger.info(f"去重完成,共保留 {len(unique_texts)} 条文本")
new_df = pd.DataFrame({df.columns[col_index]: unique_texts})
merged_df = pd.merge(new_df, df, on=df.columns[col_index], how='left')
# 方法2: 使用 map适合映射单列
# new_df[df.columns[1]] = new_df[df.columns[col_index]].map(df.set_index(df.columns[col_index])[df.columns[1]])
# new_df[df.columns[2]] = new_df[df.columns[col_index]].map(df.set_index(df.columns[col_index])[df.columns[2]])
# 如果你只想保留 new_df 中存在的列,可以这样:
final_df = merged_df[df.columns] # 按原始顺序保留所有列
logger.info(f"保存结果到 {output_path}")
final_df.to_excel(output_path, index=False)
logger.info("保存成功!")
if __name__ == "__main__":
set_seed()
main()
#
# from transformers import AutoTokenizer, AutoModel
# import torch
# import numpy as np
#
# def get_embedding(model, tokenizer, text):
# inputs = tokenizer(text, padding=True, truncation=True, return_tensors="pt").to("cuda")
# with torch.no_grad():
# outputs = model(**inputs)
# embeddings = outputs.last_hidden_state[:, 0, :].cpu().numpy() # [CLS] vector
# return embeddings
#
# def cos_sim(a, b):
# return np.dot(a, b.T) / (np.linalg.norm(a) * np.linalg.norm(b))
#
# # load the model
# model_path = "/model-pvc/suojiayi/bge-base-zh-v1.5/"
# tokenizer = AutoTokenizer.from_pretrained(model_path)
# model = AutoModel.from_pretrained(model_path).to("cuda")
#
# # test texts
# text1 = "今天天气"
# text2 = "今天天气怎么样"
#
# emb1 = get_embedding(model, tokenizer, text1)
# emb2 = get_embedding(model, tokenizer, text2)
#
# similarity = cos_sim(emb1, emb2)
# print(f"相似度:{similarity[0][0]:.4f}")

View File

@ -0,0 +1,192 @@
from transformers import AutoTokenizer, AutoModel
import torch
import numpy as np
from torch.utils.data import DataLoader
import pandas as pd
from tqdm import tqdm
from loguru import logger
import random
import argparse
import json
import pandas
from typing import List
def set_seed(seed=128):
random.seed(seed)
torch.manual_seed(seed)
np.random.seed(seed)
torch.cuda.manual_seed_all(seed)
class MyTorchDataset(torch.utils.data.Dataset):
def __init__(self, input_file, tokenizer, max_seq_length):
self.tokenizer = tokenizer
self.max_seq_length = max_seq_length
logger.info('Loading data: {}'.format(input_file))
with open(input_file, 'r', encoding='utf8') as f:
data_list = f.readlines()
logger.info("There are {} data in dataset".format(len(data_list)))
# for testing, one can take only the first 100 samples
self.raw_data_list = data_list #[:100]
# extract the dialogue content
self.query_list = []
self.index_list = []
for line in self.raw_data_list:
json_line = json.loads(line)
uid = json_line["uid"]
dialog_content = ""
# iterate over every turn of the dialogue
for turn in json_line["data"]:
if turn['role'] == 'user':
content = turn["content"]
# if content is a JSON string, parse it first
if isinstance(content, str) and content.startswith("{"):
try:
content = json.loads(content)
except json.JSONDecodeError:
pass
# convert content to a string and append it to the dialogue
if isinstance(content, dict):
content_str = " ".join(f"{k}: {v}" for k, v in content.items())
else:
content_str = str(content)
dialog_content += content_str+"\n"
# strip the trailing newline
dialog_content = dialog_content.strip()
#print(dialog_content)
self.query_list.append(dialog_content)
self.index_list.append(uid)
assert len(self.query_list) == len(self.index_list)
logger.info(f"final len_query_list:{len(self.query_list)}")
logger.info(f"final len_index_list:{len(self.index_list)}")
# batch-tokenize all the data
logger.info("Starting batch tokenization of all data ...")
self.all_data_token = self.tokenizer(
self.query_list,
padding='max_length',
truncation=True,
max_length=self.max_seq_length,
return_tensors='pt'
)
logger.info(f"批量 tokenize 所有数据 完成 ")
def __getitem__(self, idx):
# item contains input_ids, attention_mask, etc., already converted to tensors
item = {key: torch.as_tensor(value[idx]) for key, value in self.all_data_token.items()}
item['data_index'] = self.index_list[idx]
return item
def __len__(self):
return len(self.query_list)
class TextEmbedder:
def __init__(self, args):
self.args = args
self.device = torch.device("cuda:0")
self.tokenizer = AutoTokenizer.from_pretrained(self.args.model_name_or_path,
model_max_length=args.max_length,
# inference-time tokenization: use left padding, otherwise extracting the results is problematic
padding_side="left",
use_fast=False)
self.model = AutoModel.from_pretrained(self.args.model_name_or_path).to(self.device)
logger.info('tokenizer and model loaded')
self.all_dataset = MyTorchDataset(self.args.raw_data_path, self.tokenizer, self.args.max_length)
self.data_loader = DataLoader(dataset=self.all_dataset, batch_size=self.args.batch_size,
# pin_memory=True,num_workers=1, prefetch_factor=8
)
def save_data(self, results: List) -> None:
logger.info(f'Number of records to save: {len(results)}')
df = pandas.DataFrame(results)
df.sort_values(by="data_index", inplace=True)
df.reset_index(drop=True, inplace=True)
df.to_pickle(self.args.output_path)
logger.info(f"Saved pickle to {self.args.output_path}")
'''
# saving as .csv fails here
csv_file_name = "./rl_demo_emb_240624.csv"
df.to_csv(csv_file_name, index=False)
logger.info(f"Saved csv to {csv_file_name}")
'''
# file name to save to
torch_py_bin_name = './dhbq/dhbq_embedding.pth'
# Note: torch.save() and torch.load() can persist not only tensors but any Python object,
# including dicts, sets, custom class instances, and so on.
torch.save(results, torch_py_bin_name)
logger.info(f'List has been saved to {torch_py_bin_name}')
def encode_samples(self):
total_samples = len(self.all_dataset)
logger.info(f"total_samples: {total_samples}")
total_batches = len(self.data_loader)
logger.info(f"total_batches: {total_batches}")
all_embeddings_list = []
for b_idx, batch in enumerate(tqdm(self.data_loader, total=total_batches)):
self.model.eval()
batch_idx = batch["data_index"]
input_ids = batch['input_ids'].to(self.device)
attention_mask = batch['attention_mask'].to(self.device)
output = None
with torch.no_grad():
output = self.model(input_ids=input_ids, attention_mask=attention_mask)
cls_embedding = output.pooler_output.detach().cpu().numpy().tolist()
# record the embedding together with its data index
cur_sample_idx = batch_idx  # .tolist() if batch_idx is a tensor
cur_sample_dict_list = [
{"embedding": cls_emb, "data_index": s_id}
for cls_emb, s_id in zip(cls_embedding, cur_sample_idx)
]
all_embeddings_list.extend(cur_sample_dict_list)
return all_embeddings_list
if __name__ == '__main__':
torch.cuda.empty_cache()
# seed setup
set_seed()
# argument setup
parser = argparse.ArgumentParser()
# do not change this argument; it is assigned automatically
parser.add_argument('--device', default='cuda', help='device id (i.e. 0 or 0,1 or cpu)')
# number of processes (not threads); usually left unset and derived from nproc_per_node
parser.add_argument('--world_size', default=1, type=int, help='number of distributed processes')
args = parser.parse_args()
args.max_length = 512
args.batch_size = 2048
args.model_name_or_path = "/model-pvc/suojiayi/bge-base-zh-v1.5/"
#args.raw_data_path = "/dataset-pvc/suojiayi/new/train_prepare/20250423_020157/tmp_data/instruct_data_COIG_filtered_2504212014.jsonl"
args.raw_data_path = '/dataset-pvc/suojiayi/duihuabuquan/train_prepare/20250425_103516/tmp_data/instruct_data_dhbq_0425_filtered_2504251824.jsonl'
args.output_path = "./dhbq/dhbq_embedding.bin"
logger.info(f"\nargs:{args}\n")
text_embeder = TextEmbedder(args)
all_embeddings_list = text_embeder.encode_samples()
logger.info(f"len_all_embeddings_list:{len(all_embeddings_list)}")
logger.info("Finished embedding")
text_embeder.save_data(all_embeddings_list)
logger.info(f"Pipeline run complete.")

View File

@ -0,0 +1,91 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright @2024 AI. Inspur Inc.
#
# @author: suojiayi <suojiayi@inspur.com>
# @date: 2025/05/13
import json
from collections import Counter
import pandas as pd
def process_jsonl_and_save_to_excel(jsonl_file, output_excel):
"""
读取 JSONL 文件提取最后一个 content 的内容统计频率并保存到 Excel 表格中
参数:
jsonl_file: 输入的 JSONL 文件路径
output_excel: 输出的 Excel 文件路径
"""
# 存储所有最后一行的 content
last_contents = []
# 读取 JSONL 文件
with open(jsonl_file, 'r', encoding='utf-8') as f:
for line in f:
# parse each line as a JSON object
data = json.loads(line)
# take the content of the second-to-last element of the data list (the last user turn)
if 'data' in data and isinstance(data['data'], list) and len(data['data']) > 0:
last_content = data['data'][-2].get('content', '')
last_contents.append(last_content)
# count how often identical contents occur
content_counter = Counter(last_contents)
# convert to a DataFrame
df = pd.DataFrame(content_counter.items(), columns=['Content', 'Frequency'])
# sort by frequency, descending
df = df.sort_values(by='Frequency', ascending=False)
# save to an Excel file
df.to_excel(output_excel, index=False)
print(f"统计结果已保存到 {output_excel}")
# 示例调用
jsonl_file = '/dataset-pvc/suojiayi/duihuabuquan/train_prepare/20250425_103516/tmp_data/instruct_data_dhbq_0425_filtered_2504251824.jsonl' # 输入的 JSONL 文件路径
output_excel = "./dhbq/frequency.xlsx" # 输出的 Excel 文件路径
process_jsonl_and_save_to_excel(jsonl_file, output_excel)
def index_jsonl(jsonl_file_path):
"""索引jsonl文件返回一个字典键是'data'字段数组中最后一个对象的'content'"""
index_dict = {}
with open(jsonl_file_path, 'r', encoding='utf-8') as file:
for line in file:
record = json.loads(line.strip())
data_list = record.get('data', [])
if data_list: # make sure the 'data' field exists and is non-empty
content_value = data_list[-1].get('content')
if content_value is not None:
index_dict[content_value] = record
return index_dict
def update_excel_with_matched_data(input_excel_path, output_excel_path, jsonl_index):
"""根据索引更新Excel文件"""
df = pd.read_excel(input_excel_path)
matched_data = [jsonl_index.get(value) for value in df.iloc[:, 0]]
df['matched_json'] = matched_data
df.to_excel(output_excel_path, index=False)
# Usage example
if __name__ == '__main__':
input_excel_path = './dhbq/frequency.xlsx'  # input Excel file path; replace with your actual path
output_excel_path = './dhbq/frequency-score-5.xlsx'  # output Excel file path; replace with your actual path
jsonl_file_path = './dhbq/dhbq_merged_with_score_5.jsonl'  # JSONL file path; replace with your actual path
print("Indexing the JSONL file...")
jsonl_index = index_jsonl(jsonl_file_path)
print(f"Indexing finished, {len(jsonl_index)} records indexed")
update_excel_with_matched_data(input_excel_path, output_excel_path, jsonl_index)
print(f"Done, results saved to {output_excel_path}")

View File

@ -0,0 +1,86 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright @2024 AI. Inspur Inc.
#
# @author: suojiayi <suojiayi@inspur.com>
# @date: 2025/05/13
import json
import re
def is_empty(value):
"""
判断一个值是否为空包括 None空字符串空列表空字典等
:param value: 检查的值
:return: 如果值为空则返回 True否则返回 False
"""
if value is None:
return True
if isinstance(value, str) and not value.strip():
return True
if isinstance(value, (list, dict)) and len(value) == 0:
return True
return False
def filter_empty_fields(data):
"""
递归过滤 JSON 数据中的空字段数据项
:param data: 当前处理的数据可以是 dictlist
:return: 过滤后的数据
"""
if isinstance(data, list):
filtered_list = [filter_empty_fields(item) for item in data if not is_empty(item)]
return [item for item in filtered_list if not is_empty(item)]
elif isinstance(data, dict):
filtered_dict = {k: filter_empty_fields(v) for k, v in data.items() if not is_empty(v)}
final_dict = {k: v for k, v in filtered_dict.items() if not is_empty(v)}
if not final_dict:
return None
return final_dict
else:
return data
def process_jsonl(input_file, output_file):
with open(input_file, 'r', encoding='utf-8') as fin, open(output_file, 'w', encoding='utf-8') as fout:
for line in fin:
data = json.loads(line.strip())['data']
last_user = None
last_assistant = None
history = []
for i in range(len(data)):
if data[i]['role'] == 'user':
if i + 1 < len(data) and data[i+1]['role'] == 'assistant':
history.append([data[i]['content'], data[i+1]['content']])
else:
last_user = data[i]['content']
elif data[i]['role'] == 'assistant' and data[i-1]['role'] != 'user':
pass
input_content = last_user or (data[-2]['content'] if len(data) >= 2 and data[-1]['role'] == 'assistant' else "")
output_content = data[-1]['content'] if data[-1]['role'] == 'assistant' else ""
if output_content:
target_pattern = "问题:"
output_content = re.sub(target_pattern, "", output_content)
if output_content:
result = {
"input": input_content,
"output": output_content,
"history": history[:-1] if history and input_content in history[-1] else history
}
result = filter_empty_fields(result)
fout.write(json.dumps(result, ensure_ascii=False) + '\n')
# input_file = './dhbq/merged_result.jsonl'
# output_file = './dhbq/train_data_0506.jsonl'
# process_jsonl(input_file, output_file)
input_file = './dhbq/test_data.jsonl'
output_file = './dhbq/test_data_0512.jsonl'
process_jsonl(input_file, output_file)
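# Illustrative shape of one output line written by process_jsonl (hypothetical values,
# shown here only for documentation):
# {"input": "last user turn", "output": "last assistant reply",
#  "history": [["earlier user turn", "earlier assistant reply"]]}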

View File

@ -0,0 +1,126 @@
import json
from vllm import LLM, SamplingParams
from vllm.transformers_utils.tokenizer import get_tokenizer
from tqdm import tqdm, trange
from copy import deepcopy
# need vllm==0.4.2
# used together with firefly_copy
# step1: read the JSONL file to be tagged
input_jsonl_file = "/dataset-pvc/suojiayi/duihuabuquan/train_prepare/20250425_103516/tmp_data/instruct_data_dhbq_0425_filtered_2504251824.jsonl"
with open(input_jsonl_file, 'r', encoding='utf8') as f:
raw_data_list = f.readlines()
print("len_raw_data_list:", len(raw_data_list))
# for testing, one can run only the first 100 samples
# raw_data_list = raw_data_list[:100]
# step2: load the instag model and configure the tokenizer
model_path = '/model-pvc/instagger_qwen1_8B/'
tokenizer = get_tokenizer(model_path, trust_remote_code=True)
tokenizer.chat_template = (
"{% for message in messages %}"
"{{ '### ' + message['role'].capitalize() + ':\\n' }}"
"{{ message['content'] | string + '\\n' }}"
"{% endfor %}"
"{% if add_generation_prompt %}"
"{{ '### Assistant:\\n' }}"
"{% endif %}"
)
# step3: memory is sufficient here, so read everything at once and build the prompts
prompt_text_list = []
for index, line in enumerate(raw_data_list):
# print("process:", index)
json_obj = json.loads(line)
# iterate over every turn of the dialogue
query = ""
for turn in json_obj["data"]:
if turn['role'] == 'user':
content = turn["content"]
# if content is a JSON string, parse it first
if isinstance(content, str) and content.startswith("{"):
try:
content = json.loads(content)
except json.JSONDecodeError:
pass
# convert content to a string and append it to the query
if isinstance(content, dict):
content_str = " ".join(f"{k}: {v}" for k, v in content.items())
else:
content_str = str(content)
query += content_str + "\n"
prompt = query
messages = [
{"role": "system", "content": "You are a helpful assistant."},
{"role": "user", "content": prompt}
]
text = tokenizer.apply_chat_template(
messages,
tokenize=False,
add_generation_prompt=True
)
prompt_text_list.append(text)
print("len_prompt_text_list:", len(prompt_text_list))
assert len(raw_data_list) == len(prompt_text_list)
def extract_raw_qwen_res(response):
if "<|im_end|>" in response:
response = response.split("<|im_end|>")[0]
return response
# step4: vLLM sampling parameters; the official recommendation here is temperature=0
sampling_params = SamplingParams(max_tokens=64,
temperature=0,
top_p=0.8,
repetition_penalty=1.05)
# step5: load the model last, so data problems surface before the (slow) model load and save time
# set gpu_memory_utilization=0.95 for OOM error
vllm_model = LLM(model=model_path,
tensor_parallel_size=1,
gpu_memory_utilization=0.95,
trust_remote_code=True)
vllm_model.set_tokenizer(tokenizer)
# step6: run inference in batches and collect the results
tag_result_list = []
batch_size = 1024
for start in trange(0, len(prompt_text_list), batch_size):
print("index:", start)
batch_list = prompt_text_list[start: start + batch_size]
vllm_outputs = vllm_model.generate(batch_list, sampling_params)
for output in vllm_outputs:
prompt = output.prompt
cur_tag_result = extract_raw_qwen_res(output.outputs[0].text)
tag_result_list.append(cur_tag_result)
print("len_tag_result_list:", len(tag_result_list))
assert len(tag_result_list) == len(prompt_text_list)
# step7: write the results
print("Writing the instag result data")
single_turn_instag_path = "./dhbq/dhbq_instag.jsonl"
print("single_turn_instag_path:", single_turn_instag_path)
single_f_sample_save = open(single_turn_instag_path, "w")
write_single_sample_count = 0
for line, tag in zip(raw_data_list, tag_result_list):
json_line = json.loads(line)
json_line["ins_tag_label"] = tag
cur_final_line = json.dumps(deepcopy(json_line), ensure_ascii=False)
write_single_sample_count = write_single_sample_count + 1
single_f_sample_save.write(cur_final_line + "\n")
if write_single_sample_count % 100000 == 0:
print("write_single_count:", write_single_sample_count)
print("all finish success !")

View File

@ -0,0 +1,248 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright @2024 AI. Inspur Inc.
#
# @author: suojiayi <suojiayi@inspur.com>
# @date: 2025/05/13
import json
def clean_answer(answer):
"""
提取 ```json ``` 包裹的内容并清理多余标记和空格
如果包裹的内容是合法的 JSON则返回紧凑格式的 JSON 字符串
否则返回原始内容
"""
# 查找是否包含 ```json 开头和 ``` 结尾
if "```" in answer:
# 提取 ```json 和 ``` 之间的内容
start = answer.find("```json")
end = answer.find("```", start+len('```json'))
if end != -1:
content = answer[start+len('```json'):end].strip()
# print(content)
else:
content = answer.strip() # 如果没有找到结束的 ```,保留原始内容
else:
content = answer.strip() # 如果没有匹配到 ```json 标记,保留原始内容
# 尝试解析为 JSON 对象
try:
cleaned_json = json.loads(content) # 解析为 JSON 对象
return json.dumps(cleaned_json, ensure_ascii=False) # 返回紧凑格式的 JSON 字符串
except json.JSONDecodeError:
return content # 如果不是合法 JSON返回原始内容
def merge_jsonl_files(file1, file2, output_file):
"""
合并两个JSONL文件并清理answer字段
"""
merged_data = []
count1 = 0
count2 = 0
# read the first file
with open(file1, 'r', encoding='utf-8') as f1:
for line in f1:
data = json.loads(line.strip())
if 'answer' in data:
data['answer'] = clean_answer(data['answer'])
merged_data.append(data)
# read the second file
with open(file2, 'r', encoding='utf-8') as f2:
for line in f2:
data = json.loads(line.strip())
if 'answer' in data:
data['answer'] = clean_answer(data['answer'])
merged_data.append(data)
# write the merged file
with open(output_file, 'w', encoding='utf-8') as out_f:
for item in merged_data:
out_f.write(json.dumps(item, ensure_ascii=False) + '\n')
# Example usage
file1 = './dhbq/duihuabuquan_prompt_1.jsonl'
file2 = './dhbq/duihuabuquan_prompt_2.jsonl'
output_file = './dhbq/dhbq_prompt.jsonl'
file1 = './dhbq/duihuabuquan_score_1.jsonl'
file2 = './dhbq/duihuabuquan_score_2.jsonl'
output_file = './dhbq/dhbq_score.jsonl'
merge_jsonl_files(file1, file2, output_file)
import json
def merge_jsonl_files_by_uid(file1, file2, output_file):
"""
根据UID合并两个JSONL文件保留每个UID的一组'data'并整合'answer''ins_tag_label'
'answer'字段重命名为'prompt_label'同时保留'uid'字段
"""
# 从第一个文件读取数据并存储在字典中键为uid
data_dict = {}
with open(file1, 'r', encoding='utf-8') as f1:
for line in f1:
record = json.loads(line.strip())
uid = record['uid']
if uid not in data_dict:
data_dict[uid] = {
'uid': uid, # keep the uid field
'data': record['data'],
'prompt_label': record.get('answer')
}
else:
print(f"Warning: Duplicate UID found in the first file: {uid}")
# process the second file and merge by UID
with open(file2, 'r', encoding='utf-8') as f2:
for line in f2:
record = json.loads(line.strip())
uid = record['uid']
if uid in data_dict:
# if the UID exists, add or update the ins_tag_label field
if 'ins_tag_label' in record:
data_dict[uid]['ins_tag_label'] = record['ins_tag_label']
else:
# 如果UID不存在于第一个文件中则直接添加到结果集中
new_record = {
'uid': uid, # keep the uid field
'data': record['data']
}
if 'ins_tag_label' in record:
new_record['ins_tag_label'] = record['ins_tag_label']
data_dict[uid] = new_record
# write the merged data to the output file
with open(output_file, 'w', encoding='utf-8') as out_f:
for uid in data_dict:
out_f.write(json.dumps(data_dict[uid], ensure_ascii=False) + '\n')
# Example usage
file1 = './dhbq/dhbq_prompt.jsonl'
file2 = './dhbq/dhbq_instag.jsonl'
output_file = './dhbq/dhbq.jsonl'
merge_jsonl_files_by_uid(file1, file2, output_file)
import json
def merge_jsonl_files(file1, file2, output_file):
"""
合并两个JSONL文件
- 第一个文件包含'uid'和其他内容
- 第二个文件包含'index''cluster_center''embedding'
- 根据'uid''index'匹配将第二个文件的'cluster_center''embedding'添加到第一个文件中
"""
# 读取第一个文件,构建以'uid'为键的字典
data_dict = {}
with open(file1, 'r', encoding='utf-8') as f1:
for line in f1:
record = json.loads(line.strip())
uid = record['uid']
data_dict[uid] = record
# read the second file and extract 'data_idx', 'cluster_center' and 'embedding'
with open(file2, 'r', encoding='utf-8') as f2:
for line in f2:
record = json.loads(line.strip())
index = record.get('data_idx')
cluster_center = record.get('cluster_center')
#embedding = record.get('embedding')
# if the index matches a uid from the first file, merge the data
if index in data_dict:
data_dict[index]['cluster_center'] = cluster_center
#data_dict[index]['embedding'] = embedding
# write the merged data to the output file
with open(output_file, 'w', encoding='utf-8') as out_f:
for uid in data_dict:
out_f.write(json.dumps(data_dict[uid], ensure_ascii=False) + '\n')
# Example usage
file1 = './dhbq/dhbq.jsonl'
file2 = './dhbq/dhbq_cluster_kmeans_result.jsonl'
output_file = './dhbq/dhbq_merged.jsonl'
merge_jsonl_files(file1, file2, output_file)
import json
def merge_jsonl_files(file1, file2, output_file):
"""
合并两个JSONL文件
- 第一个文件包含'uid'和其他内容
- 第二个文件包含'index''cluster_center''embedding'
- 根据'uid''index'匹配将第二个文件的'cluster_center''embedding'添加到第一个文件中
"""
# 读取第一个文件,构建以'uid'为键的字典
data_dict = {}
with open(file1, 'r', encoding='utf-8') as f1:
for line in f1:
record = json.loads(line.strip())
uid = record['uid']
data_dict[uid] = record
# read the second file and extract 'uid', 'answer' (the score) and 'embedding'
with open(file2, 'r', encoding='utf-8') as f2:
for line in f2:
record = json.loads(line.strip())
index = record.get('uid')
score = record.get('answer')
embedding = record.get('embedding')
# if the uid matches a record from the first file, merge the data
if index in data_dict:
data_dict[index]['score'] = score
data_dict[index]['embedding'] = embedding
# write the merged data to the output file
with open(output_file, 'w', encoding='utf-8') as out_f:
for uid in data_dict:
out_f.write(json.dumps(data_dict[uid], ensure_ascii=False) + '\n')
# Example usage
file1 = './dhbq/dhbq_merged.jsonl'
file2 = './dhbq/dhbq_score.jsonl'
output_file = './dhbq/dhbq_merged_with_score.jsonl'
merge_jsonl_files(file1, file2, output_file)
import json
def filter_records(input_file_path, output_file_path, target_field='score', target_value=5):
"""
筛选出jsonl文件中指定字段等于特定值的记录并保存为新的jsonl文件
:param input_file_path: 输入jsonl文件路径
:param output_file_path: 输出jsonl文件路径
:param target_field: 目标字段默认为'score'
:param target_value: 目标字段需要匹配的值默认为5
"""
with open(input_file_path, 'r', encoding='utf-8') as infile, \
open(output_file_path, 'w', encoding='utf-8') as outfile:
for line in infile:
record = json.loads(line.strip())
# check whether the target field of the record equals the target value
try:
if int(record.get(target_field)) == target_value:
# write matching records to the output file
outfile.write(json.dumps(record, ensure_ascii=False) + '\n')
except Exception as e:
continue
# Usage example
if __name__ == '__main__':
input_file_path = './dhbq/dhbq_merged_with_score.jsonl'  # input JSONL file path; replace with your actual path
output_file_path = './dhbq/dhbq_merged_with_score_5.jsonl'  # output JSONL file path; replace with your actual path
filter_records(input_file_path, output_file_path)
print(f"Filtering finished, matching records saved to {output_file_path}")

View File

@ -0,0 +1,165 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright @2024 AI. Inspur Inc.
#
# @author: suojiayi <suojiayi@inspur.com>
# @date: 2025/05/13
import json
import requests
import pandas as pd
from concurrent.futures import ThreadPoolExecutor, as_completed
def read_jsonl_lines_in_batches(file_path, batch_size=10000):
"""按批次读取 JSONL 文件"""
batch = []
with open(file_path, mode="r", encoding="utf-8") as f:
for line in f:
try:
batch.append(json.loads(line.strip()))
if len(batch) == batch_size:
yield batch
batch = []
except json.JSONDecodeError as e:
print(f"Error decoding JSON: {e}")
if batch:
yield batch
def process_data_concurrently(data_list, api_url, headers, max_workers=10):
"""并发处理数据并调用 API"""
result_data = []
def process_single_data(data):
try:
query = data.get('data')
if query:
input_content = f'''你是一个多轮对话分析专家。请根据以下维度对给定的多轮对话2-3轮进行细粒度分类并按统一格式返回结果。
# 输入规范
1. 输入为完整的多轮对话2-3组问答对
2. 每组问答包含明确的用户提问和系统/客服响应
# 核心分析维度
## 1. 指代消解模式
- 显式指代直接使用完整名词订单号123
- 代词依赖需解析"它/这个/那个"等代词
- 省略恢复需补全省略成分"还有呢?""其他问题"
- 零指代无主语句"还没处理"
## 2. 信息流结构
- 自足单元当前轮次信息完整
- 跨轮依赖需前文关键信息
- 隐式跳转无过渡的话题切换
- 意图延续延续前轮任务目标
- 推理关联需逻辑推算建立联系
## 3. 上下文管理
- 短期记忆依赖1-2轮内信息
- 长期记忆引用3轮前历史记录
- 推理记忆需计算/推理关联信息
## 4. 状态演化
- 静态查询纯信息检索
- 动态演进逐步完善任务参数
- 混合操作查询+修改组合
- 信息修正更正前轮错误数据
# 输出规范'''+'''```json
{
"labels": ["指代消解::显式指代", "信息流::跨轮依赖", "上下文::推理记忆", "状态演化::动态演进"],
"analysis": {
"指代消解": {
"类型": "显式指代",
"证据": "例句:'订单A123'直接使用完整标识符"
},
"信息流": {
"类型": "跨轮依赖",
"证据": "客服要求提供订单号是对用户首轮请求的响应"
},
"上下文": {
"类型": "推理记忆",
"证据": "需要根据首轮日期推算当前状态"
},
"状态演化": {
"类型": "动态演进",
"证据": "从查询请求逐步收集必要参数"
}
}
}```'''+f'''让我们一步一步思考给出最后的返回结果输入的多轮对话{query}'''
response = requests.post(
api_url,
headers=headers,
json={
"model": "Qwen2.5-72B-Instruct",
"stream": False,
"temperature": 0.01,
"messages": [{"role": "user", "content": input_content}]
}
)
if response.status_code == 200:
try:
content = response.json()["choices"][0]["message"]["content"]
except (KeyError, IndexError, json.JSONDecodeError):
content = "无法解析返回内容"
else:
content = f"API请求失败状态码{response.status_code}"
return {
"uid": data.get('uid'),
"data": query,
"answer": content
}
except Exception as e:
print(e)
return None
with ThreadPoolExecutor(max_workers=max_workers) as executor:
futures = [executor.submit(process_single_data, data) for data in data_list]
for future in as_completed(futures):
result = future.result()
if result:
result_data.append(result)
return result_data
def save_to_excel_in_batches(data_list, output_file, batch_size=10000):
"""按批次保存数据到 Excel 文件"""
df = pd.DataFrame(data_list)
writer = pd.ExcelWriter(output_file, engine='openpyxl')
for i in range(0, len(df), batch_size):
batch_df = df.iloc[i:i + batch_size]
batch_df.to_excel(writer, index=False, startrow=i)
writer.close()
print(f"数据已成功保存到 {output_file}")
def save_to_jsonl_in_batches(data_list, output_file, batch_size=10000):
"""按批次保存数据到 JSONL 文件"""
with open(output_file, 'w', encoding='utf-8') as f:
for i in range(0, len(data_list), batch_size):
# 获取当前批次的数据
batch_data = data_list[i:i + batch_size]
# 将每个数据对象写入文件,每行一个 JSON 对象
for item in batch_data:
f.write(json.dumps(item, ensure_ascii=False) + '\n')
print(f"数据已成功保存到 {output_file}")
if __name__ == "__main__":
#output_excel_file = 'result-taoli-5.xlsx'
# api_url = "http://100.105.149.39:8000/v1/chat/completions"
api_url = "http://100.105.230.95:8000/v1/chat/completions"
headers = {
"Content-Type": "application/json",
"Authorization": "7c3eafb5-2d6e-100d-ab0f-7b2c1cdafb3c"
}
#file_path = '/dataset-pvc/suojiayi/new/train_prepare/20250423_020157/tmp_data/instruct_data_BELLE_Multiturn_Chat_filtered_2504232014.jsonl'
file_path = '/data/suojiayi/buquan/split_dhbq/part_2.jsonl'
output_file = './dhbq/duihuabuquan_prompt_2.jsonl'
#file_path = '/dataset-pvc/suojiayi/new/train_prepare/20250423_020157/tmp_data/instruct_data_COIG_filtered_2504212014.jsonl'
all_results = []
for batch in read_jsonl_lines_in_batches(file_path, batch_size=10000):
processed_batch = process_data_concurrently(batch, api_url, headers, max_workers=20)
all_results.extend(processed_batch)
# save_to_excel_in_batches(all_results, output_excel_file, batch_size=23000)
save_to_jsonl_in_batches(all_results, output_file, batch_size=10000)

View File

@ -0,0 +1,143 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright @2024 AI. Inspur Inc.
#
# @author: suojiayi <suojiayi@inspur.com>
# @date: 2025/05/13
import json
import requests
import pandas as pd
from concurrent.futures import ThreadPoolExecutor, as_completed
def read_jsonl_lines_in_batches(file_path, batch_size=10000):
"""按批次读取 JSONL 文件"""
batch = []
with open(file_path, mode="r", encoding="utf-8") as f:
for line in f:
try:
batch.append(json.loads(line.strip()))
if len(batch) == batch_size:
yield batch
batch = []
except json.JSONDecodeError as e:
print(f"Error decoding JSON: {e}")
if batch:
yield batch
def process_data_concurrently(data_list, api_url, headers, max_workers=10):
"""并发处理数据并调用 API"""
result_data = []
def process_single_data(data):
try:
query = data.get('data')
if query:
question = query[-1]['content']
conversation = query[:len(query)-1]
input_content = f'''任务说明:
请对给定的历史多轮对话进行分析结合上下文逻辑和多轮对话特性识别最后一轮问题的缺失信息 对改写后的问题进行评分1-5
改写要求
忠实性确保改写后的问题与原问题意图一致无歧义无信息偏差且无冗余修改
完整性若上下文可提供必要信息应填补空白使问题更完整同时保持语义连贯
保守性若上下文无法有效澄清原问题则保持原问题不变不强行补全
语义一致性不得改变原问题的语义或缩小其范围
中立性不得针对模型回答进行优化即避免引导性改写
时间相关性除非原问题涉及时间否则不得引入无关时间信息
评分标准1-5
分数 标准描述
5优秀 完全符合原意图填补信息精准且必要语义连贯无冗余或无关修改
4良好 基本符合原意图补充信息合理但可能稍显冗余或不够精准语义基本连贯
3中等 大体保留原意图但补充信息可能不精准或必要性不足语义稍显生硬
2较差 部分偏离原意图补充信息冗余或关联性弱语义不连贯或存在歧义
1 严重偏离原意图补充信息错误或无关语义混乱或强行补全导致问题失真
特殊情况处理
上下文无关若无法从上下文合理推断缺失信息则保持原问题不变5
强行补全若补充信息导致语义改变或范围收窄应扣分视情况给2分或更低
原多轮对话及问题{conversation}
改写后问题{question}
评分请基于上述标准给出1-5分的评分
输出格式
仅返回1-5之间的整数分数无需解释'''
# print(input_content)
response = requests.post(
api_url,
headers=headers,
json={
"model": "Qwen2.5-72B-Instruct",
"stream": False,
"temperature": 0.01,
"messages": [{"role": "user", "content": input_content}]
}
)
if response.status_code == 200:
try:
content = response.json()["choices"][0]["message"]["content"]
except (KeyError, IndexError, json.JSONDecodeError):
content = "无法解析返回内容"
else:
content = f"API请求失败状态码{response.status_code}"
return {
"uid": data.get('uid'),
"data": query,
"answer": content
}
except Exception as e:
print(e)
return None
with ThreadPoolExecutor(max_workers=max_workers) as executor:
futures = [executor.submit(process_single_data, data) for data in data_list]
for future in as_completed(futures):
result = future.result()
if result:
result_data.append(result)
return result_data
def save_to_excel_in_batches(data_list, output_file, batch_size=10000):
"""按批次保存数据到 Excel 文件"""
df = pd.DataFrame(data_list)
writer = pd.ExcelWriter(output_file, engine='openpyxl')
for i in range(0, len(df), batch_size):
batch_df = df.iloc[i:i + batch_size]
batch_df.to_excel(writer, index=False, startrow=i)
writer.close()
print(f"数据已成功保存到 {output_file}")
def save_to_jsonl_in_batches(data_list, output_file, batch_size=10000):
"""按批次保存数据到 JSONL 文件"""
with open(output_file, 'w', encoding='utf-8') as f:
for i in range(0, len(data_list), batch_size):
# 获取当前批次的数据
batch_data = data_list[i:i + batch_size]
# 将每个数据对象写入文件,每行一个 JSON 对象
for item in batch_data:
f.write(json.dumps(item, ensure_ascii=False) + '\n')
print(f"数据已成功保存到 {output_file}")
if __name__ == "__main__":
#output_excel_file = 'result-taoli-5.xlsx'
api_url = "http://100.105.48.35:8000/v1/chat/completions"
# api_url = "http://100.105.107.150:8000/v1/chat/completions"
headers = {
"Content-Type": "application/json",
"Authorization": "7c3eafb5-2d6e-100d-ab0f-7b2c1cdafb3c"
}
#file_path = '/dataset-pvc/suojiayi/new/train_prepare/20250423_020157/tmp_data/instruct_data_BELLE_Multiturn_Chat_filtered_2504232014.jsonl'
file_path = '/data/suojiayi/buquan/split_dhbq/part_1.jsonl'
output_file = './dhbq/duihuabuquan_score_1.jsonl'
#file_path = '/dataset-pvc/suojiayi/new/train_prepare/20250423_020157/tmp_data/instruct_data_COIG_filtered_2504212014.jsonl'
all_results = []
for batch in read_jsonl_lines_in_batches(file_path, batch_size=10000):
processed_batch = process_data_concurrently(batch, api_url, headers, max_workers=20)
all_results.extend(processed_batch)
# save_to_excel_in_batches(all_results, output_excel_file, batch_size=23000)
save_to_jsonl_in_batches(all_results, output_file, batch_size=10000)

View File

@ -0,0 +1,179 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright @2024 AI. Inspur Inc.
#
# @author: suojiayi <suojiayi@inspur.com>
# @date: 2025/05/13
import json
import pandas as pd
import torch
import torch.nn.functional as F  # needed by is_similar_to_existing below
def load_excel_column(file_path, column_index=2):
"""读取 Excel 文件的第三列,并将每一行解析为 dict"""
df = pd.read_excel(file_path)
result = []
seen_uids = set()
for _, row in df.iterrows():
try:
data = json.loads(row.iloc[column_index])
if isinstance(data, dict) and 'uid' in data:
if data['uid'] not in seen_uids:
seen_uids.add(data['uid'])
result.append(data)
except (json.JSONDecodeError, IndexError):
continue
return result
def process_jsonl_file(file_path, top_n_per_group, result, seen_uids):
"""
读取 jsonl 文件对每一行 group 数据尽力从中选出 top_n_per_group 条不重复数据
如果无法选够指定数量的数据则跳过该 group 并打印警告信息
"""
with open(file_path, 'r', encoding='utf-8') as f:
for line_idx, line in enumerate(f):
line = line.strip()
if not line:
continue
try:
entry = json.loads(line)
# get the group key and its list of records
key = next(k for k in entry if k != "count")
records = entry[key]
selected = []
added_count = 0
# iterate over records until top_n_per_group unique items are selected
for item in records:
uid = item.get('uid')
if added_count >= top_n_per_group:
break  # enough items selected, stop
if uid and uid not in seen_uids:
seen_uids.add(uid)
selected.append(item)
result.append(item)
added_count += 1
# if the group cannot fill top_n_per_group items, warn (or handle it otherwise)
if added_count < top_n_per_group:
print(f"[Group {key}] Only found {added_count}/{top_n_per_group} unique items. Skipping this group.")
# if needed, remove the records that were already added for this group
for item in selected:
result.remove(item)
seen_uids.discard(item.get('uid'))
else:
print(f"[Group {key}] Successfully added {added_count} items.")
except Exception as e:
print(f"Error processing line {line_idx + 1}: {e}")
continue
def is_similar_to_existing(embedding, existing_embeddings, threshold=0.85):
"""
判断当前 embedding 是否与已有 embeddings 的最大余弦相似度超过阈值
"""
if not existing_embeddings:
return False
current_emb = torch.tensor(embedding, dtype=torch.float32)
existing_emb = torch.tensor(existing_embeddings, dtype=torch.float32)
similarity = F.cosine_similarity(current_emb.unsqueeze(0), existing_emb)
return (similarity > threshold).any().item()
def process_cluster_jsonl(file_path, top_n_per_group, result, seen_uids, similarity_threshold=0.5):
"""
处理 dhbq_count_cluster.jsonl每组最多取 top_n_per_group
并根据 embedding 字段进行余弦相似度过滤
"""
with open(file_path, 'r', encoding='utf-8') as f:
for line_idx, line in enumerate(f):
line = line.strip()
if not line:
continue
try:
entry = json.loads(line)
key = next(k for k in entry if k != "count")
records = entry[key]
selected = []
added_count = 0
existing_embeddings = [item['embedding'] for item in result if 'embedding' in item]  # all embeddings collected so far
for item in records:
uid = item.get('uid')
emb = item.get('embedding')
if added_count >= top_n_per_group:
break
if not emb or not isinstance(emb, list):
print(f"Skipping item {uid}: missing or invalid embedding.")
continue
if uid and uid not in seen_uids:
if is_similar_to_existing(emb, existing_embeddings, similarity_threshold):
print(f"Skipping item {uid} due to high similarity.")
continue
seen_uids.add(uid)
selected.append(item)
result.append(item)
existing_embeddings.append(emb)
added_count += 1
print(f"[Group {key}] Successfully added {added_count} items.")
except Exception as e:
print(f"Error processing line {line_idx + 1}: {e}")
continue
def main():
result = []
seen_uids = set()
# Step 1: read the third column of the Excel file
print("Step 1: Reading Excel file...")
excel_data = load_excel_column("./dhbq/frequency-score-5.xlsx", column_index=2)
for item in excel_data:
uid = item.get('uid')
if uid and uid not in seen_uids:
seen_uids.add(uid)
result.append(item)
# Step 2: process dhbq_count_prompt_label.jsonl, top 170 per group
print("Step 2: Processing dhbq_count_prompt_label.jsonl...")
process_jsonl_file("./dhbq/dhbq_count_prompt_label.jsonl", 170, result, seen_uids)
# Step 3: process dhbq_count_instag.jsonl, top 3 per group
print("Step 3: Processing dhbq_count_instag.jsonl...")
process_jsonl_file("./dhbq/dhbq_count_instag.jsonl", 3, result, seen_uids)
# # Step 4: process dhbq_count_cluster.jsonl, top 5 per group (without similarity filtering)
# print("Step 4: Processing dhbq_count_cluster.jsonl...")
# process_jsonl_file("./dhbq/dhbq_count_cluster_0513.jsonl", 5, result, seen_uids)
print("Step 4: Processing dhbq_count_cluster.jsonl...")
process_cluster_jsonl("./dhbq/dhbq_count_cluster_0513.jsonl", 5, result, seen_uids, similarity_threshold=0.5)
# finally, write the output file
output_file = "./dhbq/merged_result_for_more_test_0513.jsonl"
with open(output_file, 'w', encoding='utf-8') as f:
for item in result:
f.write(json.dumps(item, ensure_ascii=False) + '\n')
print(f"Total merged items: {len(result)}")
print(f"Saved to: {output_file}")
if __name__ == "__main__":
main()