"""Generate synthetic recruitment-analytics Q&A samples and export them to Excel.

Each sample pairs a randomly built request payload (question plus user
metadata) with the question as rewritten by a chat-completion model. All
samples are written to ``Config.OUTPUT_FILE``.
"""

import asyncio
import functools
import json
import logging
import random
from concurrent.futures import ThreadPoolExecutor
from typing import Any, Dict, List, Tuple

import pandas as pd
import requests
from openpyxl import Workbook

logging.basicConfig(level=logging.INFO)


# Configuration parameters
class Config:
    """Static settings for sample generation and the chat endpoint."""

    OUTPUT_FILE = "recruitment_data.xlsx"
    FIXED_QUESTIONS = [
        "上个月面试了多少人",
        "本周安排了几个面试",
        "招聘进度如何",
        "有多少候选人进入二面",
        "销售岗位的招聘情况",
        "技术岗位的简历筛选数量",
        "最近一周的offer发放数量",
        "哪个部门的招聘完成率最高",
        "招聘成本是否超出预算",
        "候选人平均面试周期是多长",
    ]
    # NOTE(review): the empty entry produces samples without a location;
    # presumably deliberate -- confirm.
    LOCATIONS = ["北京", "上海", "广州", "深圳", "杭州", "", "成都"]
    INTENTS = ["招聘数据", "招聘进度", "其他", "成本分析", "效率统计"]
    COMMISSIONER_TYPES = ["yxz", "hrbp", "recruiter", "manager"]
    USER_NAMES = ["张招聘", "李HR", "王人事", "赵经理", "刘专员"]

    # Chat-completion endpoint settings used by chat().
    API_URL = "http://100.105.1.227:8000/v1/chat/completions"
    # NOTE(review): credential is hard-coded in source; consider loading it
    # from the environment or a secrets store instead.
    API_TOKEN = "7c3eafb5-2d6e-100d-ab0f-7b2c1cdafb3c"
    MODEL_NAME = "Qwen3-72B"
    REQUEST_TIMEOUT = 180  # seconds, passed to requests as ``timeout``


# Prompt asking the model to invent one recruitment-analytics question.
_QUESTION_GENERATION_PROMPT = """你是一个资深HR分析师。请生成一个招聘数据分析的查询请求,要求:
- 聚焦在以下至少一个方面:面试、offer、入职、渠道效果、成本、周期时间
- 包含具体的时间范围(如最近一周/上月/本季度)
- 可选项包含部门/岗位/地域等维度
- 直接返回问题,不要任何解释
例如:
对比北京和上海地区过去两个月销售岗位的offer接受率"""

# Prompt asking the model to rewrite (or keep) a question; ``{question}`` is
# filled in with the user's question text.
_REWRITE_PROMPT_TEMPLATE = """
你是一个专业招聘数据分析助手。请按以下规则处理问题:
1. 如果问题已包含明确且可清晰回答,直接返回原问题
2. 如果问题模糊或不完整,按标准改写:
- 补充时间范围(最近/上月/本季度等)
- 明确量化指标(数量/比率/趋势等)
- 指定具体对象(岗位/部门/渠道等)
3. 直接返回最终问题,不要任何解释
待处理问题:{question}
"""


async def chat(input_content):
    """Send one user message to the chat endpoint and return the reply text.

    The blocking ``requests.post`` call is run in the event loop's default
    executor so that several chat() coroutines can be in flight at once.

    Args:
        input_content: Text sent as the single ``user`` message.

    Returns:
        The ``choices[0].message.content`` field of the JSON response.

    Raises:
        requests.RequestException: On connection errors or timeouts.
        RuntimeError: If the status code is not 200 or the response body
            does not have the expected structure.
    """
    send_request = functools.partial(
        requests.post,
        Config.API_URL,
        headers={
            "Content-Type": "application/json",
            # NOTE(review): sent without a "Bearer " prefix -- confirm this
            # is what the server expects.
            "Authorization": Config.API_TOKEN,
        },
        json={
            "model": Config.MODEL_NAME,
            "stream": False,
            "temperature": 0.6,
            # NOTE(review): OpenAI-compatible servers usually name these
            # top_p / top_k / min_p; verify these keys are honoured.
            "TopP": 0.95,
            "TopK": 20,
            "MinP": 0,
            "messages": [{"role": "user", "content": input_content}],
        },
        timeout=Config.REQUEST_TIMEOUT,
    )
    loop = asyncio.get_running_loop()
    response = await loop.run_in_executor(None, send_request)

    if response.status_code != 200:
        logging.error(f"API request failed with status code: {response.status_code}")
        raise RuntimeError(
            f"API request failed with status code: {response.status_code}"
        )

    try:
        result = response.json()["choices"][0]["message"]["content"]
    except (ValueError, KeyError, IndexError, TypeError) as e:
        logging.error(f"Error processing API response: {e}")
        raise RuntimeError(f"Error processing API response: {e}") from e

    # Short pause between calls, kept from the original as light pacing.
    await asyncio.sleep(0.1)
    return result


# Ask the model for a varied question
async def generate_diverse_questions() -> str:
    """Return one model-generated recruitment-analytics question.

    Raises:
        requests.RequestException, RuntimeError: Propagated from chat().
    """
    return await chat(_QUESTION_GENERATION_PROMPT)


# Build the recruitment-related input payload
async def generate_input_data(use_fixed: bool = True) -> Dict[str, Any]:
    """Build one randomized request payload.

    The question is drawn from ``Config.FIXED_QUESTIONS`` when
    ``random.random() > 0.3`` and generated by the model otherwise. If
    generation fails, a fixed question is used so that one failed call does
    not abort the whole batch.

    Args:
        use_fixed: Accepted for backward compatibility; the current
            implementation does not consult it.

    Returns:
        A dict with ``messages``, ``location``, ``uuid``, ``intent``,
        ``loginUserName``, ``loginUserId`` and ``commissioner_type``.
    """
    if random.random() > 0.3:
        base_question = random.choice(Config.FIXED_QUESTIONS)
    else:
        try:
            base_question = await generate_diverse_questions()
        except (requests.RequestException, RuntimeError):
            logging.exception(
                "Question generation failed; falling back to a fixed question"
            )
            base_question = random.choice(Config.FIXED_QUESTIONS)

    return {
        "messages": [{"role": "user", "content": base_question}],
        "location": random.choice(Config.LOCATIONS),
        # Integer bounds: float arguments (1e18) are rejected by
        # random.randint on Python 3.12+ and cannot represent 1e19 - 1.
        "uuid": str(random.randint(10**18, 10**19 - 1)),
        "intent": random.choice(Config.INTENTS),
        "loginUserName": random.choice(Config.USER_NAMES),
        "loginUserId": "hr_" + str(random.randint(1000, 9999)),
        "commissioner_type": random.choice(Config.COMMISSIONER_TYPES),
    }


# Handle a single request
async def process_request(
    input_data: Dict[str, Any],
) -> Tuple[Dict[str, Any], Dict[str, Any]]:
    """Ask the model to rewrite the question carried by ``input_data``.

    Args:
        input_data: Payload as produced by generate_input_data().

    Returns:
        ``(input_data, output_data)``. ``output_data`` has ``code`` "0" and
        the rewritten question in ``result`` on success; on any failure it
        has ``code`` "1", the error text in ``message`` and an empty
        ``result``. This function does not raise.
    """
    try:
        user_content = input_data["messages"][0]["content"]
        # Only the question text goes into the prompt, not the whole payload.
        prompt = _REWRITE_PROMPT_TEMPLATE.format(question=user_content)
        rewritten_question = await chat(prompt)
        output_data = {"code": "0", "message": "", "result": rewritten_question}
    except Exception as e:
        # Per-sample boundary: record the failure instead of aborting the batch.
        output_data = {"code": "1", "message": str(e), "result": ""}
    return input_data, output_data


# Save the data to Excel
def save_to_excel(data: List[Dict[str, Any]], filename: str):
    """Write the input/output pairs to an Excel workbook.

    Args:
        data: Items with an ``"input"`` payload and an ``"output"`` result,
            as returned by generate_data().
        filename: Destination path of the ``.xlsx`` file.
    """
    rows = [
        {
            "输入问题": item["input"]["messages"][0]["content"],
            "输出问题": item["output"]["result"],
            "地点": item["input"]["location"],
            "UUID": item["input"]["uuid"],
            "意图": item["input"]["intent"],
            "用户名": item["input"]["loginUserName"],
            "用户ID": item["input"]["loginUserId"],
            "专员类型": item["input"]["commissioner_type"],
            "状态码": item["output"]["code"],
            "消息": item["output"]["message"],
        }
        for item in data
    ]
    df = pd.DataFrame(rows)
    df.to_excel(filename, index=False, engine='openpyxl')
    print(f"数据已保存到 {filename}")


# Generate data concurrently
async def generate_data(num_samples: int) -> List[Dict[str, Any]]:
    """Generate ``num_samples`` input payloads and process them concurrently.

    Returns:
        A list of ``{"input": ..., "output": ...}`` dicts, in the order the
        inputs were generated.
    """
    # First build all input payloads.
    input_data_list = await asyncio.gather(
        *(generate_input_data() for _ in range(num_samples))
    )

    # Then process all requests concurrently.
    results = await asyncio.gather(
        *(process_request(input_data) for input_data in input_data_list)
    )

    # Combine the results.
    return [
        {"input": input_data, "output": output_data}
        for input_data, output_data in results
    ]


# Entry point
async def main():
    """Generate the samples, save them, and print the first three."""
    try:
        num_samples = 2000
        print(f"开始生成 {num_samples} 条招聘数据...")

        data_pairs = await generate_data(num_samples)
        save_to_excel(data_pairs, Config.OUTPUT_FILE)

        # Print the first 3 samples.
        print("\n样本示例:")
        for i, pair in enumerate(data_pairs[:3], 1):
            print(f"样本 {i}:")
            print("输入问题:", pair["input"]["messages"][0]["content"])
            print("输出问题:", pair["output"]["result"])
            print("-" * 50)
    except Exception as e:
        print(f"发生错误: {e}")


if __name__ == "__main__":
    asyncio.run(main())