205 lines
6.8 KiB
Python
205 lines
6.8 KiB
Python
import json
|
||
import random
|
||
import asyncio
|
||
from typing import List, Dict, Any, Tuple
|
||
from concurrent.futures import ThreadPoolExecutor
|
||
import pandas as pd
|
||
from openpyxl import Workbook
|
||
import requests
|
||
import logging
|
||
# Configure the root logger so that records at INFO level and above are emitted.
logging.basicConfig(level=logging.INFO)
|
||
# 配置参数
|
||
class Config:
    """Static settings and sample pools used to synthesize recruitment requests."""

    # Workbook that receives the generated input/output pairs.
    OUTPUT_FILE = "recruitment_data.xlsx"

    # Hand-written seed questions; the text is runtime data and stays as-is.
    FIXED_QUESTIONS = [
        "上个月面试了多少人",
        "本周安排了几个面试",
        "招聘进度如何",
        "有多少候选人进入二面",
        "销售岗位的招聘情况",
        "技术岗位的简历筛选数量",
        "最近一周的offer发放数量",
        "哪个部门的招聘完成率最高",
        "招聘成本是否超出预算",
        "候选人平均面试周期是多长",
    ]

    # Pools sampled at random for the metadata fields of each request.
    # NOTE(review): the empty string in LOCATIONS is part of the pool --
    # presumably it simulates a request without a location; confirm.
    LOCATIONS = [
        "北京",
        "上海",
        "广州",
        "深圳",
        "杭州",
        "",
        "成都",
    ]
    INTENTS = [
        "招聘数据",
        "招聘进度",
        "其他",
        "成本分析",
        "效率统计",
    ]
    COMMISSIONER_TYPES = [
        "yxz",
        "hrbp",
        "recruiter",
        "manager",
    ]
    USER_NAMES = [
        "张招聘",
        "李HR",
        "王人事",
        "赵经理",
        "刘专员",
    ]
|
||
|
||
|
||
|
||
|
||
async def chat(input_content):
    """Send one user message to the chat-completions endpoint and return the reply.

    The blocking ``requests.post`` call is executed in the event loop's
    default executor, so several ``chat`` coroutines awaited together (for
    example through ``asyncio.gather``) overlap instead of running strictly
    one after another on the loop thread.

    Args:
        input_content: Text sent as the content of the single ``user`` message.

    Returns:
        The ``choices[0].message.content`` value of the JSON response.

    Raises:
        RuntimeError: If the HTTP status code is not 200, or the response
            body does not contain ``choices[0].message.content``.
        Exception: Anything raised by ``requests.post`` itself (for example
            on a connection failure or timeout) propagates unchanged.
    """
    # NOTE(review): endpoint and credential are hard-coded in source; consider
    # loading them from the environment or a secrets store instead.
    api_url = "http://100.105.1.227:8000/v1/chat/completions"
    headers = {
        "Content-Type": "application/json",
        # Sent verbatim, without an auth-scheme prefix such as "Bearer ".
        # NOTE(review): confirm this is the format the server expects.
        "Authorization": "7c3eafb5-2d6e-100d-ab0f-7b2c1cdafb3c",
    }
    payload = {
        "model": "Qwen3-72B",
        "stream": False,
        "temperature": 0.6,
        # NOTE(review): OpenAI-compatible servers commonly spell these
        # "top_p" / "top_k" / "min_p"; verify the server honours these keys.
        "TopP": 0.95,
        "TopK": 20,
        "MinP": 0,
        "messages": [{"role": "user", "content": input_content}],
    }

    def _post():
        # requests.post takes the target address as its ``url`` argument.
        return requests.post(api_url, headers=headers, json=payload, timeout=180)

    # Run the synchronous HTTP call off the event-loop thread.
    loop = asyncio.get_running_loop()
    response = await loop.run_in_executor(None, _post)

    if response.status_code != 200:
        logging.error(f"API request failed with status code: {response.status_code}")
        # Fail loudly: callers record the error instead of hitting an
        # unbound local variable further down.
        raise RuntimeError(
            f"API request failed with status code: {response.status_code}"
        )

    try:
        result = response.json()["choices"][0]["message"]["content"]
    except (ValueError, KeyError, IndexError, TypeError) as e:
        logging.error(f"Error processing API response: {e}")
        raise RuntimeError(f"Error processing API response: {e}") from e

    # Short pause after each successful call, as in the original flow.
    await asyncio.sleep(0.1)
    return result
|
||
|
||
|
||
|
||
# 模拟模型生成多样化问题
|
||
async def generate_diverse_questions() -> str:
    """Ask the model for one newly worded recruitment-analytics question.

    Returns:
        The text returned by ``chat`` for the generation prompt.

    Raises:
        Exception: Whatever ``chat`` raises is propagated to the caller.
    """
    # The prompt is runtime data sent to the model and is kept verbatim.
    input_content = (
        "你是一个资深HR分析师。请生成一个招聘数据分析的查询请求,要求:\n"
        "- 聚焦在以下至少一个方面:面试、offer、入职、渠道效果、成本、周期时间\n"
        "- 包含具体的时间范围(如最近一周/上月/本季度)\n"
        "- 可选项包含部门/岗位/地域等维度\n"
        "- 直接返回问题,不要任何解释\n"
        "\n"
        "例如:\n"
        "对比北京和上海地区过去两个月销售岗位的offer接受率"
    )

    # ``chat`` is a coroutine function; without ``await`` the caller would
    # receive a coroutine object instead of the generated question text.
    gen_question = await chat(input_content)
    await asyncio.sleep(0.1)

    return gen_question
|
||
|
||
|
||
|
||
# 生成招聘相关的输入数据
|
||
async def generate_input_data(use_fixed: bool = True) -> Dict[str, Any]:
    """Build one synthetic request payload with a question and random metadata.

    The question is drawn from ``Config.FIXED_QUESTIONS`` when
    ``random.random() > 0.3`` and is otherwise produced by
    ``generate_diverse_questions``.

    Args:
        use_fixed: Not consulted by the current logic (the fixed/generated
            choice is purely random); kept so existing callers keep working.

    Returns:
        A dict with the keys ``messages``, ``location``, ``uuid``, ``intent``,
        ``loginUserName``, ``loginUserId`` and ``commissioner_type``.
    """
    if random.random() > 0.3:
        base_question = random.choice(Config.FIXED_QUESTIONS)
    else:
        base_question = await generate_diverse_questions()

    return {
        "messages": [{
            "role": "user",
            "content": base_question,
        }],
        "location": random.choice(Config.LOCATIONS),
        # Integer bounds are required: float arguments such as 1e18 are
        # rejected by random.randint on Python 3.12+, and the float
        # expression 1e19-1 rounds to 1e19, which permitted a 20-digit value.
        "uuid": str(random.randint(10**18, 10**19 - 1)),
        "intent": random.choice(Config.INTENTS),
        "loginUserName": random.choice(Config.USER_NAMES),
        "loginUserId": "hr_" + str(random.randint(1000, 9999)),
        "commissioner_type": random.choice(Config.COMMISSIONER_TYPES),
    }
|
||
|
||
# 处理单个请求
|
||
async def process_request(input_data: Dict[str, Any]) -> Tuple[Dict[str, Any], Dict[str, Any]]:
    """Ask the model to rewrite the request's question and wrap the outcome.

    Args:
        input_data: Request payload; the question is read from
            ``input_data["messages"][0]["content"]``.

    Returns:
        A ``(input_data, output_data)`` tuple. ``output_data`` has the keys
        ``code``, ``message`` and ``result``: on success ``code`` is ``"0"``
        and ``result`` holds the model's reply; if anything raises, ``code``
        is ``"1"``, ``message`` is ``str(exception)`` and ``result`` is ``""``.
    """
    try:
        # Only the question text belongs in the prompt; interpolating the
        # whole payload dict would leak its metadata fields into the prompt.
        user_content = input_data["messages"][0]["content"]
        input_content = (
            "\n"
            "你是一个专业招聘数据分析助手。请按以下规则处理问题:\n"
            "1. 如果问题已包含明确且可清晰回答,直接返回原问题\n"
            "2. 如果问题模糊或不完整,按标准改写:\n"
            "- 补充时间范围(最近/上月/本季度等)\n"
            "- 明确量化指标(数量/比率/趋势等)\n"
            "- 指定具体对象(岗位/部门/渠道等)\n"
            "3. 直接返回最终问题,不要任何解释\n"
            "\n"
            f"待处理问题:{user_content}\n"
        )
        rewritten_question = await chat(input_content)
    except Exception as e:
        # Per-request boundary: record the failure in the output instead of
        # letting one bad request abort the whole batch.
        output_data = {
            "code": "1",
            "message": str(e),
            "result": "",
        }
    else:
        output_data = {
            "code": "0",
            "message": "",
            "result": rewritten_question,
        }
    return input_data, output_data
|
||
|
||
# 保存数据到Excel
|
||
def save_to_excel(data: List[Dict[str, Any]], filename: str):
    """Flatten input/output pairs into one row each and write them to an Excel file.

    Args:
        data: Items with an ``"input"`` request dict and an ``"output"``
            result dict.
        filename: Path of the workbook to write.
    """

    def _to_row(entry):
        # Bind both halves first, then map them onto the sheet's columns.
        request, response = entry["input"], entry["output"]
        return {
            "输入问题": request["messages"][0]["content"],
            "输出问题": response["result"],
            "地点": request["location"],
            "UUID": request["uuid"],
            "意图": request["intent"],
            "用户名": request["loginUserName"],
            "用户ID": request["loginUserId"],
            "专员类型": request["commissioner_type"],
            "状态码": response["code"],
            "消息": response["message"],
        }

    table = pd.DataFrame([_to_row(entry) for entry in data])
    table.to_excel(filename, index=False, engine='openpyxl')
    print(f"数据已保存到 {filename}")
|
||
|
||
# 并发生成数据
|
||
async def generate_data(num_samples: int) -> List[Dict[str, Any]]:
    """Create ``num_samples`` request payloads and process each of them.

    Args:
        num_samples: Number of payloads to generate.

    Returns:
        One ``{"input": ..., "output": ...}`` dict per payload, in the order
        returned by ``asyncio.gather``.
    """
    # Stage 1: build every input payload, awaited together.
    payloads = await asyncio.gather(
        *(generate_input_data() for _ in range(num_samples))
    )

    # Stage 2: run every payload through the rewrite step, awaited together.
    pairs = await asyncio.gather(
        *(process_request(payload) for payload in payloads)
    )

    # Stage 3: pair each request with its result.
    return [{"input": request, "output": response} for request, response in pairs]
|
||
|
||
# 主函数
|
||
async def main():
    """Generate the data set, save it to Excel and print the first three samples.

    Any exception raised along the way is caught and reported with ``print``.
    """
    try:
        total = 2000
        print(f"开始生成 {total} 条招聘数据...")
        samples = await generate_data(total)

        save_to_excel(samples, Config.OUTPUT_FILE)

        # Show the first three samples as a quick sanity check.
        print("\n样本示例:")
        preview = samples[:3]
        for number, sample in enumerate(preview, 1):
            question_in = sample["input"]["messages"][0]["content"]
            question_out = sample["output"]["result"]
            print(f"样本 {number}:")
            print("输入问题:", question_in)
            print("输出问题:", question_out)
            print("-" * 50)

    except Exception as e:
        print(f"发生错误: {e}")
|
||
|
||
if __name__ == "__main__":
    # Run the async pipeline only when executed as a script, not on import.
    asyncio.run(main())