数智员工造数

This commit is contained in:
@zhaochsh01 2025-05-29 16:05:00 +08:00
parent 0655f98c70
commit 0a6da985dd

View File

@ -0,0 +1,205 @@
import json
import random
import asyncio
from typing import List, Dict, Any, Tuple
from concurrent.futures import ThreadPoolExecutor
import pandas as pd
from openpyxl import Workbook
import requests
import logging
logging.basicConfig(level=logging.INFO)
# Configuration parameters
class Config:
    """Static settings and sample pools used by the data generator."""

    # Workbook that the generated rows are written to.
    OUTPUT_FILE: str = "recruitment_data.xlsx"

    # Canned user questions that are sampled verbatim.
    FIXED_QUESTIONS: List[str] = [
        "上个月面试了多少人",
        "本周安排了几个面试",
        "招聘进度如何",
        "有多少候选人进入二面",
        "销售岗位的招聘情况",
        "技术岗位的简历筛选数量",
        "最近一周的offer发放数量",
        "哪个部门的招聘完成率最高",
        "招聘成本是否超出预算",
        "候选人平均面试周期是多长",
    ]

    # Pool for the "location" field; note that it contains an empty string.
    LOCATIONS: List[str] = [
        "北京",
        "上海",
        "广州",
        "深圳",
        "杭州",
        "",
        "成都",
    ]

    # Pool for the "intent" field.
    INTENTS: List[str] = [
        "招聘数据",
        "招聘进度",
        "其他",
        "成本分析",
        "效率统计",
    ]

    # Pool for the "commissioner_type" field.
    COMMISSIONER_TYPES: List[str] = [
        "yxz",
        "hrbp",
        "recruiter",
        "manager",
    ]

    # Pool for the "loginUserName" field.
    USER_NAMES: List[str] = [
        "张招聘",
        "李HR",
        "王人事",
        "赵经理",
        "刘专员",
    ]
async def chat(input_content):
    """Send one user message to the chat-completions endpoint and return the reply.

    The blocking ``requests.post`` call is run in the event loop's default
    executor so that several ``chat`` coroutines can overlap when they are
    awaited together (for example via ``asyncio.gather``).

    Args:
        input_content: Prompt text sent as the single ``user`` message.

    Returns:
        The ``content`` string of the first choice in the JSON response.

    Raises:
        RuntimeError: If the endpoint answers with a non-200 status code or
            the response body does not have the expected structure.
        requests.RequestException: Propagated unchanged on connection
            errors or when the 180 second timeout expires.
    """

    def _post():
        # ``requests.post`` names its first parameter ``url``; passing it as
        # ``api_url=`` raises TypeError, so it is given positionally here.
        # NOTE(review): the credential is hard-coded and is sent without a
        # scheme prefix such as "Bearer" -- confirm with the service owner
        # and move it out of source control.
        # NOTE(review): "TopP"/"TopK"/"MinP" are forwarded exactly as
        # written; confirm the server recognises these key names.
        return requests.post(
            "http://100.105.1.227:8000/v1/chat/completions",
            headers={
                "Content-Type": "application/json",
                "Authorization": "7c3eafb5-2d6e-100d-ab0f-7b2c1cdafb3c",
            },
            json={
                "model": "Qwen3-72B",
                "stream": False,
                "temperature": 0.6,
                "TopP": 0.95,
                "TopK": 20,
                "MinP": 0,
                "messages": [{"role": "user", "content": input_content}],
            },
            timeout=180,
        )

    # Keep the event loop free while the HTTP round trip is in flight.
    loop = asyncio.get_running_loop()
    response = await loop.run_in_executor(None, _post)

    if response.status_code != 200:
        logging.error(f"API request failed with status code: {response.status_code}")
        raise RuntimeError(
            f"API request failed with status code: {response.status_code}"
        )

    try:
        result = response.json()["choices"][0]["message"]["content"]
    except (ValueError, KeyError, IndexError, TypeError) as e:
        logging.error(f"Error processing API response: {e}")
        # Fail with a descriptive error instead of returning an unbound name.
        raise RuntimeError(f"Error processing API response: {e}") from e

    # Short pause retained from the original implementation.
    await asyncio.sleep(0.1)
    return result
# Ask the model for a varied recruitment question
async def generate_diverse_questions() -> str:
    """Ask the chat model for one recruitment-analytics question.

    Returns:
        The text returned by ``chat`` for the fixed generation prompt.

    Raises:
        Exception: Whatever ``chat`` raises is propagated unchanged.
    """
    # Prompt text is sent to the model verbatim.
    input_content = (
        "你是一个资深HR分析师。请生成一个招聘数据分析的查询请求要求\n"
        "- 聚焦在以下至少一个方面面试offer入职渠道效果成本周期时间\n"
        "- 包含具体的时间范围如最近一周/上月/本季度\n"
        "- 可选项包含部门/岗位/地域等维度\n"
        "- 直接返回问题不要任何解释\n"
        "例如\n"
        "对比北京和上海地区过去两个月销售岗位的offer接受率"
    )
    # ``chat`` is a coroutine function: without ``await`` the caller would
    # receive a coroutine object instead of the generated question text.
    gen_question = await chat(input_content)
    await asyncio.sleep(0.1)
    return gen_question
# Build one recruitment-related input record
async def generate_input_data(use_fixed: bool = True) -> Dict[str, Any]:
    """Build one randomized request payload.

    Roughly 70% of calls pick a question from ``Config.FIXED_QUESTIONS``;
    the remainder ask ``generate_diverse_questions`` for a model-written
    one. If that model call fails, a fixed question is used instead so a
    single failed request does not abort a whole batch.

    Args:
        use_fixed: Accepted for interface compatibility; the body does not
            consult it.

    Returns:
        A dict with the keys ``messages``, ``location``, ``uuid``,
        ``intent``, ``loginUserName``, ``loginUserId`` and
        ``commissioner_type``.
    """
    if random.random() > 0.3:
        base_question = random.choice(Config.FIXED_QUESTIONS)
    else:
        try:
            base_question = await generate_diverse_questions()
        except Exception as e:
            # Best-effort: keep producing data when the model is unavailable.
            logging.warning("Falling back to a fixed question: %s", e)
            base_question = random.choice(Config.FIXED_QUESTIONS)

    return {
        "messages": [{
            "role": "user",
            "content": base_question,
        }],
        "location": random.choice(Config.LOCATIONS),
        # Integer bounds: float arguments such as 1e18 are rejected by
        # random.randint on Python 3.12+, and 1e19-1 rounds up to 1e19 as a
        # float. These bounds always yield a 19-digit identifier.
        "uuid": str(random.randint(10**18, 10**19 - 1)),
        "intent": random.choice(Config.INTENTS),
        "loginUserName": random.choice(Config.USER_NAMES),
        "loginUserId": "hr_" + str(random.randint(1000, 9999)),
        "commissioner_type": random.choice(Config.COMMISSIONER_TYPES),
    }
# Process a single request
async def process_request(input_data: Dict[str, Any]) -> Tuple[Dict[str, Any], Dict[str, Any]]:
    """Ask the model to rewrite the question carried by one input payload.

    Args:
        input_data: Payload whose ``messages[0]["content"]`` holds the
            question text.

    Returns:
        A ``(input_data, output_data)`` tuple. ``output_data`` has the keys
        ``code``, ``message`` and ``result``. On success ``code`` is ``"0"``
        and ``result`` is the model reply. On any exception ``code`` is
        ``"1"``, ``message`` is the exception text and ``result`` is empty.
    """
    try:
        user_content = input_data["messages"][0]["content"]
        # Interpolate the question text itself. The previous prompt embedded
        # the repr of the whole payload dict and left user_content unused.
        input_content = (
            "\n"
            "你是一个专业招聘数据分析助手请按以下规则处理问题\n"
            "1. 如果问题已包含明确且可清晰回答直接返回原问题\n"
            "2. 如果问题模糊或不完整按标准改写\n"
            "- 补充时间范围最近/上月/本季度等\n"
            "- 明确量化指标数量/比率/趋势等\n"
            "- 指定具体对象岗位/部门/渠道等\n"
            "3. 直接返回最终问题不要任何解释\n"
            f"待处理问题{user_content}\n"
        )
        rewritten_question = await chat(input_content)
        output_data = {
            "code": "0",
            "message": "",
            "result": rewritten_question,
        }
    except Exception as e:
        # Per-request boundary: record the failure in the output row rather
        # than letting it cancel the surrounding batch.
        output_data = {
            "code": "1",
            "message": str(e),
            "result": "",
        }
    return input_data, output_data
# Save the generated data to Excel
def save_to_excel(data: List[Dict[str, Any]], filename: str):
    """Flatten input/output pairs into rows and write them to an Excel file.

    Args:
        data: Items with an ``"input"`` payload dict and an ``"output"``
            result dict.
        filename: Path of the workbook to write.
    """
    table = pd.DataFrame([
        {
            "输入问题": entry["input"]["messages"][0]["content"],
            "输出问题": entry["output"]["result"],
            "地点": entry["input"]["location"],
            "UUID": entry["input"]["uuid"],
            "意图": entry["input"]["intent"],
            "用户名": entry["input"]["loginUserName"],
            "用户ID": entry["input"]["loginUserId"],
            "专员类型": entry["input"]["commissioner_type"],
            "状态码": entry["output"]["code"],
            "消息": entry["output"]["message"],
        }
        for entry in data
    ])
    table.to_excel(filename, index=False, engine='openpyxl')
    print(f"数据已保存到 {filename}")
# 并发生成数据
async def generate_data(num_samples: int) -> List[Dict[str, Any]]:
# 首先生成所有输入数据
input_tasks = [generate_input_data() for _ in range(num_samples)]
input_data_list = await asyncio.gather(*input_tasks)
# 然后并发处理所有请求
process_tasks = [process_request(input_data) for input_data in input_data_list]
results = await asyncio.gather(*process_tasks)
# 组合结果
output = []
for input_data, output_data in results:
output.append({
"input": input_data,
"output": output_data
})
return output
# Main entry coroutine
async def main():
    """Generate the data set, save it to Excel and print the first samples.

    Any exception raised along the way is caught and reported on stdout.
    """
    num_samples = 2000
    try:
        print(f"开始生成 {num_samples} 条招聘数据...")
        generated = await generate_data(num_samples)
        save_to_excel(generated, Config.OUTPUT_FILE)

        # Show the first three samples.
        print("\n样本示例:")
        for index, sample in enumerate(generated[:3], start=1):
            question_in = sample["input"]["messages"][0]["content"]
            question_out = sample["output"]["result"]
            print(f"样本 {index}:")
            print("输入问题:", question_in)
            print("输出问题:", question_out)
            print("-" * 50)
    except Exception as e:
        print(f"发生错误: {e}")
# Run the generator only when this file is executed as a script.
if __name__ == "__main__":
    asyncio.run(main())