""" 政务12345全国数据生成系统 功能: 1. 支持全国范围地理位置生成 2. 多层级分类扩展 3. 数据保存至Excel 4. 真实业务场景模拟 """ import pandas as pd from openai import OpenAI import random import time import re from typing import List, Dict, Tuple client = OpenAI(api_key="your-api-key") class NationalDataGenerator: def __init__(self, excel_path: str, category_column: str): self.base_categories = self._load_excel_categories(excel_path, category_column) self.location_pool = self._generate_national_locations() self.expanded_categories = self._expand_categories_with_gpt() self.used_records = set() def _load_excel_categories(self, path: str, column: str) -> List[str]: """从Excel读取基础分类""" df = pd.read_excel(path) return df[column].dropna().unique().tolist() def _generate_national_locations(self, num=200) -> List[str]: """生成全国真实地理位置库""" response = client.chat.completions.create( model="gpt-4-turbo", messages=[{ "role": "user", "content": f"生成{num}个中国各城市真实存在的地理位置,按省市区三级格式,示例:\n- 广东省广州市天河区珠江新城\n- 浙江省杭州市余杭区未来科技城" }] ) return [line.split(" ")[1] for line in response.choices[0].message.content.strip().split("\n")] def _expand_categories_with_gpt(self) -> Dict[str, List[str]]: """GPT扩展分类体系""" category_map = {} for base_cat in self.base_categories: response = client.chat.completions.create( model="gpt-4-turbo", messages=[{ "role": "user", "content": f"生成与【{base_cat}】相关但具有政务场景区分度的5个细分类型,示例:\n- 类型1:施工许可违规\n- 类型2:夜间施工超时" }] ) sub_cats = [re.sub(r":.*", "", line.split(" ")[1]) for line in response.choices[0].message.content.strip().split("\n")] category_map[base_cat] = sub_cats time.sleep(1) return category_map def generate_dataset(self, num_records: int) -> pd.DataFrame: """生成核心数据集""" data = [] while len(data) < num_records: base_cat = random.choice(self.base_categories) sub_cat = random.choice(self.expanded_categories[base_cat]) location = random.choice(self.location_pool) content, keywords = self._generate_content(base_cat, sub_cat, location) if content and self._validate_record(content, keywords, base_cat): data.append({ "ID": len(data)+1, "内容": content, "关键词": " ".join(keywords), "参考答案": base_cat, "细分类型": sub_cat, "地理位置": location }) time.sleep(1.2) return pd.DataFrame(data) def _generate_content(self, base_cat: str, sub_cat: str, location: str) -> Tuple[str, List[str]]: """生成政务工单内容""" prompt = f"""生成真实可信的12345政务工单,要求: 1. 主分类:【{base_cat}】 2. 细分类型:【{sub_cat}】 3. 发生地点:【{location}】 4. 包含要素:时间、具体问题、影响范围、市民诉求 5. 生成5个关键词(必须包含{base_cat}) 6. 
6. 内容长度80-150字

示例格式:
市民反映{location}某建筑工地违规夜间施工至凌晨,噪音严重干扰周边居民。已向环保部门投诉3次未解决,要求立即停工整顿。
关键词:夜间施工 噪音污染 环保投诉 施工许可 居民维权"""
        try:
            response = client.chat.completions.create(
                model="gpt-4-turbo",
                messages=[
                    {"role": "system", "content": "你是政务数据生成专家"},
                    {"role": "user", "content": prompt}
                ],
                temperature=0.7,
                max_tokens=600
            )
            raw_text = response.choices[0].message.content.strip()
            return self._parse_generated_text(raw_text)
        except Exception as e:
            print(f"Generation failed: {e}")
            return None, []

    def _parse_generated_text(self, text: str) -> Tuple[str, List[str]]:
        """Split the generated text into ticket content and keyword list."""
        content = re.sub(r"关键词[:：].*", "", text, flags=re.S).strip()
        matches = re.findall(r"关键词[:：](.+)", text)
        keywords = matches[0].split()[:5] if matches else []
        return content, keywords

    def _validate_record(self, content: str, keywords: List[str], category: str) -> bool:
        """Five-way record validation."""
        return (
            len(content) >= 80                       # meets the minimum length
            and len(keywords) == 5                   # exactly five keywords
            and category in keywords                 # keywords include the base category
            and content not in self.used_records     # no duplicate content
            and any(c.isdigit() for c in content)    # contains numeric details (time, counts, ...)
        )


# Example input file (input.xlsx)
"""
| 基础分类 |
|----------|
| 施工管理 |
| 消费维权 |
| 城市管理 |
| 公共服务 |
"""

if __name__ == "__main__":
    # Initialise the generator
    generator = NationalDataGenerator(
        excel_path="input.xlsx",
        category_column="基础分类"
    )

    # Generate 100 records
    df = generator.generate_dataset(100)

    # Save to Excel
    with pd.ExcelWriter("government_12345_data.xlsx") as writer:
        df.to_excel(writer, index=False)

    print("Sample of the generated data:")
    print(df[["ID", "内容", "关键词", "参考答案"]].head(3).to_string(index=False))
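

# --- Optional sketch (not part of the original script) ---
# A minimal retry helper for the chat.completions.create calls above, assuming
# transient API errors should simply be retried with an increasing delay.
# The function name and parameters are illustrative, not an existing API.
def chat_with_retry(messages, model="gpt-4-turbo", retries=3, delay=2.0, **kwargs):
    """Call the chat endpoint, retrying on failure, and return the reply text."""
    for attempt in range(retries):
        try:
            response = client.chat.completions.create(
                model=model, messages=messages, **kwargs
            )
            return response.choices[0].message.content.strip()
        except Exception:
            if attempt == retries - 1:
                raise  # give up after the last attempt
            time.sleep(delay * (attempt + 1))  # wait longer after each failure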