offline_data_model_pipline/智数员工/zhaopin_zaoshu.py
2025-05-29 16:05:00 +08:00

205 lines
6.8 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import json
import random
import asyncio
from typing import List, Dict, Any, Tuple
from concurrent.futures import ThreadPoolExecutor
import pandas as pd
from openpyxl import Workbook
import requests
import logging
logging.basicConfig(level=logging.INFO)
# Configuration parameters
class Config:
    """Static settings and value pools used to synthesize recruitment samples."""

    # Workbook that save_to_excel() writes the generated samples to.
    OUTPUT_FILE = "recruitment_data.xlsx"

    # Canned user questions; one is picked at random for most samples.
    FIXED_QUESTIONS = [
        "上个月面试了多少人",
        "本周安排了几个面试",
        "招聘进度如何",
        "有多少候选人进入二面",
        "销售岗位的招聘情况",
        "技术岗位的简历筛选数量",
        "最近一周的offer发放数量",
        "哪个部门的招聘完成率最高",
        "招聘成本是否超出预算",
        "候选人平均面试周期是多长",
    ]

    # Candidate values for the "location" field (the pool includes an
    # empty string).
    LOCATIONS = [
        "北京",
        "上海",
        "广州",
        "深圳",
        "杭州",
        "",
        "成都",
    ]

    # Candidate values for the "intent" field.
    INTENTS = [
        "招聘数据",
        "招聘进度",
        "其他",
        "成本分析",
        "效率统计",
    ]

    # Candidate values for the "commissioner_type" field.
    COMMISSIONER_TYPES = [
        "yxz",
        "hrbp",
        "recruiter",
        "manager",
    ]

    # Candidate values for the "loginUserName" field.
    USER_NAMES = [
        "张招聘",
        "李HR",
        "王人事",
        "赵经理",
        "刘专员",
    ]
async def chat(input_content):
    """Send one user message to the chat-completions endpoint and return the reply.

    The blocking ``requests.post`` call runs in the event loop's default
    executor so that several ``chat`` coroutines awaited through
    ``asyncio.gather`` can overlap instead of stalling the loop one by one.

    Args:
        input_content: Text sent as the single ``user`` message.

    Returns:
        The value found at ``choices[0].message.content`` in the JSON reply.

    Raises:
        RuntimeError: The server answered with a non-200 status, or the
            reply body did not contain ``choices[0].message.content``.
        requests.RequestException: Propagated unchanged from
            ``requests.post`` (connection errors, timeouts, ...).
    """

    def _post():
        # The URL must be the first positional argument: requests.post() has
        # no ``api_url`` keyword, so passing it by that name fails with a
        # TypeError before any request is sent.
        return requests.post(
            "http://100.105.1.227:8000/v1/chat/completions",
            headers={
                "Content-Type": "application/json",
                # NOTE(review): hard-coded credential committed to source;
                # consider loading it from the environment instead.
                "Authorization": "7c3eafb5-2d6e-100d-ab0f-7b2c1cdafb3c",
            },
            json={
                "model": "Qwen3-72B",
                "stream": False,
                "temperature": 0.6,
                # NOTE(review): OpenAI-compatible servers usually expect
                # top_p / top_k / min_p; confirm that this endpoint honours
                # the capitalised names below.
                "TopP": 0.95,
                "TopK": 20,
                "MinP": 0,
                "messages": [{"role": "user", "content": input_content}],
            },
            timeout=180,
        )

    loop = asyncio.get_running_loop()
    response = await loop.run_in_executor(None, _post)

    if response.status_code != 200:
        message = f"API request failed with status code: {response.status_code}"
        logging.error(message)
        # Raise instead of falling through to ``return result`` with an
        # unbound local; callers see a meaningful error message.
        raise RuntimeError(message)

    try:
        result = response.json()["choices"][0]["message"]["content"]
    except Exception as e:
        logging.error(f"Error processing API response: {e}")
        raise RuntimeError(f"Error processing API response: {e}") from e

    # Short pause kept from the original implementation.
    await asyncio.sleep(0.1)
    return result
# Ask the model for a diversified recruitment-analytics question
async def generate_diverse_questions() -> str:
    """Request one model-generated recruitment-analytics question.

    Sends a fixed Chinese prompt (HR-analyst persona with constraints on
    topic, time range and optional dimensions) to ``chat`` and returns the
    reply.

    Returns:
        Whatever ``chat`` returns for the prompt (the generated question).

    Raises:
        Any exception raised by ``chat`` is propagated to the caller.
    """
    input_content = """你是一个资深HR分析师。请生成一个招聘数据分析的查询请求要求
- 聚焦在以下至少一个方面面试、offer、入职、渠道效果、成本、周期时间
- 包含具体的时间范围(如最近一周/上月/本季度)
- 可选项包含部门/岗位/地域等维度
- 直接返回问题,不要任何解释
例如:
对比北京和上海地区过去两个月销售岗位的offer接受率"""
    # ``chat`` is a coroutine function: without ``await`` this would return
    # an un-awaited coroutine object instead of the generated text.
    gen_question = await chat(input_content)
    await asyncio.sleep(0.1)
    return gen_question
# Build the input payload for one synthetic recruitment request
async def generate_input_data(use_fixed: bool = True) -> Dict[str, Any]:
    """Build one randomized request payload.

    With probability of roughly 0.7 the question is drawn from
    ``Config.FIXED_QUESTIONS``; otherwise it is requested from
    ``generate_diverse_questions``. If that request raises, the failure is
    logged and a fixed question is used instead, so a single failed model
    call does not abort a whole batch.

    Args:
        use_fixed: Currently not consulted by the implementation; kept so
            existing callers that pass it keep working.

    Returns:
        A dict with the keys ``messages``, ``location``, ``uuid``,
        ``intent``, ``loginUserName``, ``loginUserId`` and
        ``commissioner_type``.
    """
    if random.random() > 0.3:
        base_question = random.choice(Config.FIXED_QUESTIONS)
    else:
        try:
            base_question = await generate_diverse_questions()
        except Exception as e:
            logging.warning(
                f"Question generation failed, using a fixed question: {e}"
            )
            base_question = random.choice(Config.FIXED_QUESTIONS)

    return {
        "messages": [{
            "role": "user",
            "content": base_question,
        }],
        "location": random.choice(Config.LOCATIONS),
        # Integer bounds: random.randint() rejects float arguments on
        # Python 3.12+, and the float expression 1e19-1 rounds to 1e19,
        # which allowed a 20-digit value. These bounds always give 19 digits.
        "uuid": str(random.randint(10**18, 10**19 - 1)),
        "intent": random.choice(Config.INTENTS),
        "loginUserName": random.choice(Config.USER_NAMES),
        "loginUserId": "hr_" + str(random.randint(1000, 9999)),
        "commissioner_type": random.choice(Config.COMMISSIONER_TYPES),
    }
# Process a single request
async def process_request(input_data: Dict[str, Any]) -> Tuple[Dict[str, Any], Dict[str, Any]]:
    """Ask the model to rewrite the question carried by ``input_data``.

    Args:
        input_data: Request payload; the question is read from
            ``input_data["messages"][0]["content"]``.

    Returns:
        ``(input_data, output_data)``. On success ``output_data`` has
        ``code == "0"`` and the model reply under ``result``. On any
        exception it has ``code == "1"``, the exception text under
        ``message`` and an empty ``result``. This function never raises.
    """
    try:
        user_content = input_data["messages"][0]["content"]
        # Interpolate only the question text. The original prompt embedded
        # the repr of the whole payload dict (uuid, user name, ...) and left
        # the extracted question unused.
        input_content = f"""
你是一个专业招聘数据分析助手。请按以下规则处理问题:
1. 如果问题已包含明确且可清晰回答,直接返回原问题
2. 如果问题模糊或不完整,按标准改写:
- 补充时间范围(最近/上月/本季度等)
- 明确量化指标(数量/比率/趋势等)
- 指定具体对象(岗位/部门/渠道等)
3. 直接返回最终问题,不要任何解释
待处理问题:{user_content}
"""
        rewritten_question = await chat(input_content)
        output_data = {
            "code": "0",
            "message": "",
            "result": rewritten_question,
        }
        return input_data, output_data
    except Exception as e:
        # Record the failure in the log as well as in the returned row, so
        # errors are visible while the batch is still running.
        logging.error(f"Request processing failed: {e}")
        output_data = {
            "code": "1",
            "message": str(e),
            "result": "",
        }
        return input_data, output_data
# Save the generated samples to an Excel workbook
def save_to_excel(data: List[Dict[str, Any]], filename: str):
    """Flatten input/output pairs into rows and write them to ``filename``.

    Args:
        data: Items shaped like ``{"input": {...}, "output": {...}}``.
        filename: Destination path of the ``.xlsx`` file.
    """

    def _as_row(sample: Dict[str, Any]) -> Dict[str, Any]:
        # One spreadsheet row per sample; column order follows key order.
        request = sample["input"]
        reply = sample["output"]
        return {
            "输入问题": request["messages"][0]["content"],
            "输出问题": reply["result"],
            "地点": request["location"],
            "UUID": request["uuid"],
            "意图": request["intent"],
            "用户名": request["loginUserName"],
            "用户ID": request["loginUserId"],
            "专员类型": request["commissioner_type"],
            "状态码": reply["code"],
            "消息": reply["message"],
        }

    table = pd.DataFrame([_as_row(sample) for sample in data])
    table.to_excel(filename, index=False, engine='openpyxl')
    print(f"数据已保存到 {filename}")
# Generate samples concurrently
async def generate_data(num_samples: int) -> List[Dict[str, Any]]:
    """Create ``num_samples`` request payloads and process each of them.

    Runs in two phases, each awaited with ``asyncio.gather``: first every
    payload is built, then every payload is sent through
    ``process_request``.

    Returns:
        A list of ``{"input": ..., "output": ...}`` dicts in the same order
        as the generated payloads.
    """
    # Phase 1: build every input payload.
    payloads = await asyncio.gather(
        *(generate_input_data() for _ in range(num_samples))
    )

    # Phase 2: process every payload.
    processed = await asyncio.gather(
        *(process_request(payload) for payload in payloads)
    )

    # Pair each request with its response.
    return [
        {"input": request, "output": reply}
        for request, reply in processed
    ]
# Main entry point
async def main():
    """Generate the data set, save it to Excel and print the first three samples.

    Any exception raised along the way is caught and reported on stdout.
    """
    try:
        num_samples = 2000
        print(f"开始生成 {num_samples} 条招聘数据...")

        samples = await generate_data(num_samples)
        save_to_excel(samples, Config.OUTPUT_FILE)

        # Show the first three samples as a quick sanity check.
        print("\n样本示例:")
        preview = samples[:3]
        for index, sample in enumerate(preview, start=1):
            print(f"样本 {index}:")
            print("输入问题:", sample["input"]["messages"][0]["content"])
            print("输出问题:", sample["output"]["result"])
            print("-" * 50)
    except Exception as e:
        print(f"发生错误: {e}")


if __name__ == "__main__":
    asyncio.run(main())