#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright @2024 AI. Inspur Inc.
#
# @author: suojiayi <suojiayi@inspur.com>
# @date: 2025/05/13

import json
import requests
import pandas as pd
from concurrent.futures import ThreadPoolExecutor, as_completed
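
# Third-party dependencies: requests, pandas, and openpyxl (the ExcelWriter
# engine used below), e.g. `pip install requests pandas openpyxl`.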


def read_jsonl_lines_in_batches(file_path, batch_size=10000):
    """Read a JSONL file in batches, yielding lists of parsed records."""
    batch = []
    with open(file_path, mode="r", encoding="utf-8") as f:
        for line in f:
            try:
                batch.append(json.loads(line.strip()))
                if len(batch) == batch_size:
                    yield batch
                    batch = []
            except json.JSONDecodeError as e:
                # Skip malformed lines rather than aborting the whole run.
                print(f"Error decoding JSON: {e}")
    # Yield whatever is left over after the last full batch.
    if batch:
        yield batch
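
# Each input record is expected to carry the "uid" and "data" keys read by
# process_single_data() below; an illustrative (hypothetical) line:
#   {"uid": "0001", "data": "<the multi-turn dialogue text>"}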


def process_data_concurrently(data_list, api_url, headers, max_workers=10):
    """Process the data concurrently, calling the chat-completions API for each record."""
    result_data = []

    def process_single_data(data):
        try:
            query = data.get('data')
            if query:
                input_content = '''你是一个多轮对话分析专家。请根据以下维度对给定的多轮对话(2-3轮)进行细粒度分类,并按统一格式返回结果。
# 输入规范
1. 输入为完整的多轮对话(2-3组问答对)
2. 每组问答包含明确的用户提问和系统/客服响应

# 核心分析维度
## 1. 指代消解模式
- 显式指代:直接使用完整名词(订单号123)
- 代词依赖:需解析"它/这个/那个"等代词
- 省略恢复:需补全省略成分("还有呢?"→"其他问题")
- 零指代:无主语句("还没处理")

## 2. 信息流结构
- 自足单元:当前轮次信息完整
- 跨轮依赖:需前文关键信息
- 隐式跳转:无过渡的话题切换
- 意图延续:延续前轮任务目标
- 推理关联:需逻辑推算建立联系

## 3. 上下文管理
- 短期记忆:依赖1-2轮内信息
- 长期记忆:引用3轮前历史记录
- 推理记忆:需计算/推理关联信息

## 4. 状态演化
- 静态查询:纯信息检索
- 动态演进:逐步完善任务参数
- 混合操作:查询+修改组合
- 信息修正:更正前轮错误数据

# 输出规范''' + '''```json
{
    "labels": ["指代消解::显式指代", "信息流::跨轮依赖", "上下文::推理记忆", "状态演化::动态演进"],
    "analysis": {
        "指代消解": {
            "类型": "显式指代",
            "证据": "例句:'订单A123'直接使用完整标识符"
        },
        "信息流": {
            "类型": "跨轮依赖",
            "证据": "客服要求提供订单号是对用户首轮请求的响应"
        },
        "上下文": {
            "类型": "推理记忆",
            "证据": "需要根据首轮日期推算当前状态"
        },
        "状态演化": {
            "类型": "动态演进",
            "证据": "从查询请求逐步收集必要参数"
        }
    }
}```''' + f'''让我们一步一步思考,给出最后的返回结果,输入的多轮对话:{query}'''
                response = requests.post(
                    api_url,
                    headers=headers,
                    json={
                        "model": "Qwen2.5-72B-Instruct",
                        "stream": False,
                        "temperature": 0.01,
                        "messages": [{"role": "user", "content": input_content}]
                    },
                    timeout=600,  # assumption: bound each request so worker threads cannot hang forever
                )
                if response.status_code == 200:
                    try:
                        content = response.json()["choices"][0]["message"]["content"]
                    except (KeyError, IndexError, json.JSONDecodeError):
                        content = "Failed to parse the response content"
                else:
                    content = f"API request failed with status code: {response.status_code}"
                return {
                    "uid": data.get('uid'),
                    "data": query,
                    "answer": content
                }
        except Exception as e:
            # Log and drop the record; the caller filters out None results.
            print(e)
        return None

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [executor.submit(process_single_data, data) for data in data_list]
        for future in as_completed(futures):
            result = future.result()
            if result:
                result_data.append(result)

    return result_data
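

# The prompt above asks the model to wrap its verdict in a ```json ... ``` fence.
# A minimal sketch (not part of the original pipeline) for recovering that JSON
# from the "answer" field; the regex and the None fallback are assumptions, and
# it presumes a single fenced block per answer, possibly surrounded by the
# model's free-form reasoning text.
import re


def extract_labels(answer):
    """Return the dict parsed from the ```json ...``` block in `answer`, or None."""
    match = re.search(r"```json\s*(\{.*\})\s*```", answer, re.DOTALL)
    if match:
        try:
            return json.loads(match.group(1))
        except json.JSONDecodeError:
            pass
    return None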


def save_to_excel_in_batches(data_list, output_file, batch_size=10000):
    """Save the results to an Excel file in batches."""
    df = pd.DataFrame(data_list)
    with pd.ExcelWriter(output_file, engine='openpyxl') as writer:
        for i in range(0, len(df), batch_size):
            batch_df = df.iloc[i:i + batch_size]
            # Write the header row only for the first batch, and shift later
            # batches down one row to account for it, so batches land in
            # contiguous, non-overlapping row ranges.
            batch_df.to_excel(writer, index=False, header=(i == 0),
                              startrow=i if i == 0 else i + 1)
    print(f"Data successfully saved to {output_file}")


def save_to_jsonl_in_batches(data_list, output_file, batch_size=10000):
    """Save the results to a JSONL file in batches."""
    with open(output_file, 'w', encoding='utf-8') as f:
        for i in range(0, len(data_list), batch_size):
            # Take the current batch of records.
            batch_data = data_list[i:i + batch_size]
            # Write each record as one JSON object per line.
            for item in batch_data:
                f.write(json.dumps(item, ensure_ascii=False) + '\n')
    print(f"Data successfully saved to {output_file}")
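

# The save helpers above require all results to be buffered in memory first.
# A minimal append-mode variant (a sketch: the 'a' mode and the per-batch call
# pattern are assumptions, not part of the original pipeline) that would let
# the main loop flush each processed batch as soon as it completes:
def append_to_jsonl(batch, output_file):
    """Append one batch of records to a JSONL file, one JSON object per line."""
    with open(output_file, 'a', encoding='utf-8') as f:
        for item in batch:
            f.write(json.dumps(item, ensure_ascii=False) + '\n')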


if __name__ == "__main__":
    # output_excel_file = 'result-taoli-5.xlsx'
    # api_url = "http://100.105.149.39:8000/v1/chat/completions"
    api_url = "http://100.105.230.95:8000/v1/chat/completions"
    headers = {
        "Content-Type": "application/json",
        "Authorization": "7c3eafb5-2d6e-100d-ab0f-7b2c1cdafb3c"
    }

    # file_path = '/dataset-pvc/suojiayi/new/train_prepare/20250423_020157/tmp_data/instruct_data_BELLE_Multiturn_Chat_filtered_2504232014.jsonl'
    file_path = '/data/suojiayi/buquan/split_dhbq/part_2.jsonl'
    output_file = './dhbq/duihuabuquan_prompt_2.jsonl'
    # file_path = '/dataset-pvc/suojiayi/new/train_prepare/20250423_020157/tmp_data/instruct_data_COIG_filtered_2504212014.jsonl'
    all_results = []
    for batch in read_jsonl_lines_in_batches(file_path, batch_size=10000):
        processed_batch = process_data_concurrently(batch, api_url, headers, max_workers=20)
        all_results.extend(processed_batch)
    # save_to_excel_in_batches(all_results, output_excel_file, batch_size=23000)
    save_to_jsonl_in_batches(all_results, output_file, batch_size=10000)