154 lines
5.7 KiB
Python
154 lines
5.7 KiB
Python
"""
|
||
政务12345全国数据生成系统
|
||
功能:
|
||
1. 支持全国范围地理位置生成
|
||
2. 多层级分类扩展
|
||
3. 数据保存至Excel
|
||
4. 真实业务场景模拟
|
||
"""
|
||
|
||
import random
import re
import time
from typing import Dict, List, Optional, Tuple

import pandas as pd
from openai import OpenAI
|
||
|
||
# Module-level OpenAI client; the NationalDataGenerator methods below call
# client.chat.completions.create(...) on it.
# NOTE(review): "your-api-key" looks like a placeholder - a real key must be
# supplied before running, preferably from configuration rather than source.
client = OpenAI(api_key="your-api-key")
|
||
|
||
class NationalDataGenerator:
    """Generate synthetic 12345 government-hotline work orders with a chat model.

    Construction loads the base categories from one column of an Excel file,
    then asks the module-level ``client`` for a pool of locations and for
    sub-types of every base category. ``generate_dataset`` combines random
    picks from those pools into prompts and collects the replies that pass
    validation.
    """

    # Chat model used for every request.
    MODEL = "gpt-4-turbo"
    # Pauses (seconds) between consecutive API calls, used as crude rate limiting.
    CATEGORY_CALL_PAUSE = 1
    RECORD_CALL_PAUSE = 1.2
    # Column order of the generated dataset (also used for an empty result).
    COLUMNS = ["ID", "内容", "关键词", "参考答案", "细分类型", "地理位置"]

    # Leading list markers such as "- ", "* ", "1. ", "2、" or "3)".
    _BULLET_RE = re.compile(r"^\s*(?:[-*•·]+|\d+\s*[.、))])\s*")
    # Generic ordinal label such as "类型1:" that carries no information itself.
    _GENERIC_LABEL_RE = re.compile(r"^类型\s*[\d一二三四五六七八九十]+\s*[::]\s*")
    # ASCII or full-width colon.
    _COLON_RE = re.compile(r"[::]")
    # Keyword line; the colon may be ASCII or full-width.
    _KEYWORDS_RE = re.compile(r"关键词\s*[::]\s*(.*)")
    # Separators the model may put between keywords.
    _KEYWORD_SPLIT_RE = re.compile(r"[\s,,、;;]+")

    def __init__(self, excel_path: str, category_column: str):
        """Load the categories and build the location / sub-category pools.

        Args:
            excel_path: Path of the Excel workbook holding the base categories.
            category_column: Name of the column to read the categories from.

        Raises:
            ValueError: If the column yields no categories, or the model
                returns no usable location.
        """
        # Contents already accepted by generate_dataset (duplicate detection).
        self.used_records = set()
        self.base_categories = self._load_excel_categories(excel_path, category_column)
        if not self.base_categories:
            raise ValueError(
                f"No categories found in column {category_column!r} of {excel_path!r}"
            )
        self.location_pool = self._generate_national_locations()
        self.expanded_categories = self._expand_categories_with_gpt()

    def _load_excel_categories(self, path: str, column: str) -> List[str]:
        """Return the distinct, non-empty values of ``column`` as strings."""
        values = pd.read_excel(path)[column].dropna()
        cleaned = (str(value).strip() for value in values)
        # dict.fromkeys removes duplicates while keeping the sheet order.
        return list(dict.fromkeys(item for item in cleaned if item))

    def _chat(self, messages: List[Dict[str, str]], **options) -> str:
        """Send ``messages`` to the chat model and return the stripped reply.

        Returns an empty string when the reply carries no text content.
        """
        response = client.chat.completions.create(
            model=self.MODEL, messages=messages, **options
        )
        return (response.choices[0].message.content or "").strip()

    @classmethod
    def _clean_list_lines(cls, text: str) -> List[str]:
        """Split a bulleted reply into items without their list markers.

        Blank lines and heading lines that end with a colon are dropped.
        """
        items = []
        for line in text.splitlines():
            item = cls._BULLET_RE.sub("", line).strip()
            if item and not item.endswith((":", ":")):
                items.append(item)
        return items

    @classmethod
    def _extract_sub_category(cls, line: str) -> str:
        """Return the sub-category name contained in one reply line.

        Handles both "- 类型1:<name>" (label dropped, name kept) and
        "- <name>:<description>" (description dropped).
        """
        item = cls._BULLET_RE.sub("", line).strip()
        item = cls._GENERIC_LABEL_RE.sub("", item)
        return cls._COLON_RE.split(item, maxsplit=1)[0].strip()

    def _generate_national_locations(self, num=200) -> List[str]:
        """Ask the model for ``num`` locations and return them as a list.

        Raises:
            ValueError: If the reply contains no usable line.
        """
        reply = self._chat([{
            "role": "user",
            "content": f"生成{num}个中国各城市真实存在的地理位置,按省市区三级格式,示例:\n- 广东省广州市天河区珠江新城\n- 浙江省杭州市余杭区未来科技城"
        }])
        locations = self._clean_list_lines(reply)
        if not locations:
            raise ValueError("The model returned no usable locations")
        return locations

    def _expand_categories_with_gpt(self) -> Dict[str, List[str]]:
        """Map every base category to the sub-types suggested by the model."""
        category_map = {}
        for base_cat in self.base_categories:
            reply = self._chat([{
                "role": "user",
                "content": f"生成与【{base_cat}】相关但具有政务场景区分度的5个细分类型,示例:\n- 类型1:施工许可违规\n- 类型2:夜间施工超时"
            }])
            names = (self._extract_sub_category(line)
                     for line in self._clean_list_lines(reply))
            sub_cats = [name for name in names if name]
            # Fall back to the base category so random.choice always has
            # something to pick from.
            category_map[base_cat] = sub_cats or [base_cat]
            time.sleep(self.CATEGORY_CALL_PAUSE)
        return category_map

    def generate_dataset(self, num_records: int,
                         max_attempts: Optional[int] = None) -> pd.DataFrame:
        """Generate up to ``num_records`` validated work orders.

        Args:
            num_records: Number of records wanted.
            max_attempts: Upper bound on generation attempts; defaults to ten
                attempts per requested record. The bound prevents an endless
                loop when every request fails (e.g. an invalid API key).

        Returns:
            A DataFrame with the columns listed in ``COLUMNS``. It holds fewer
            than ``num_records`` rows if the attempt budget ran out.
        """
        if max_attempts is None:
            max_attempts = max(num_records, 1) * 10

        data = []
        attempts = 0
        while len(data) < num_records and attempts < max_attempts:
            attempts += 1
            base_cat = random.choice(self.base_categories)
            sub_cat = random.choice(self.expanded_categories[base_cat])
            location = random.choice(self.location_pool)

            content, keywords = self._generate_content(base_cat, sub_cat, location)
            if content and self._validate_record(content, keywords, base_cat):
                # Remember the text so the duplicate check can reject repeats.
                self.used_records.add(content)
                data.append({
                    "ID": len(data) + 1,
                    "内容": content,
                    "关键词": " ".join(keywords),
                    "参考答案": base_cat,
                    "细分类型": sub_cat,
                    "地理位置": location,
                })
            time.sleep(self.RECORD_CALL_PAUSE)

        if len(data) < num_records:
            print(f"Only {len(data)} of {num_records} records were generated "
                  f"after {attempts} attempts")
        return pd.DataFrame(data, columns=self.COLUMNS)

    def _generate_content(self, base_cat: str, sub_cat: str,
                          location: str) -> Tuple[Optional[str], List[str]]:
        """Request one work order and return ``(content, keywords)``.

        Returns ``(None, [])`` when the request fails; the error is printed
        so that a single failed call does not abort the whole run.
        """
        prompt = "\n".join((
            "生成真实可信的12345政务工单,要求:",
            f"1. 主分类:【{base_cat}】",
            f"2. 细分类型:【{sub_cat}】",
            f"3. 发生地点:【{location}】",
            "4. 包含要素:时间、具体问题、影响范围、市民诉求",
            f"5. 生成5个关键词(必须包含{base_cat})",
            "6. 内容长度80-150字",
            "",
            "示例格式:",
            f"市民反映{location}某建筑工地违规夜间施工至凌晨,噪音严重干扰周边居民。已向环保部门投诉3次未解决,要求立即停工整顿。",
            "关键词:夜间施工 噪音污染 环保投诉 施工许可 居民维权",
        ))

        try:
            raw_text = self._chat(
                [
                    {"role": "system", "content": "你是政务数据生成专家"},
                    {"role": "user", "content": prompt},
                ],
                temperature=0.7,
                max_tokens=600,
            )
        except Exception as e:
            # Deliberate best-effort boundary: report and let the caller retry.
            print(f"生成失败:{str(e)}")
            return None, []
        return self._parse_generated_text(raw_text)

    def _parse_generated_text(self, text: str) -> Tuple[str, List[str]]:
        """Split a reply into its body and at most five keywords.

        The keyword line is removed from the body. A reply without a keyword
        line yields an empty keyword list instead of raising.
        """
        content = self._KEYWORDS_RE.sub("", text).strip()
        match = self._KEYWORDS_RE.search(text)
        if match is None:
            return content, []
        parts = self._KEYWORD_SPLIT_RE.split(match.group(1).strip())
        return content, [part for part in parts if part][:5]

    def _validate_record(self, content: str, keywords: List[str], category: str) -> bool:
        """Return True when the record passes all five checks.

        Checks: at least 80 characters, exactly five keywords, the category is
        one of the keywords, the content was not accepted before, and the
        content contains at least one digit.
        """
        return (
            bool(content) and
            len(content) >= 80 and
            len(keywords) == 5 and
            category in keywords and
            content not in self.used_records and
            any(c.isdigit() for c in content)
        )
|
||
|
||
|
||
# Example input file (input.xlsx)
"""
| 基础分类     |
|--------------|
| 施工管理     |
| 消费维权     |
| 城市管理     |
| 公共服务     |
"""
|
||
|
||
|
||
if __name__ == "__main__":
    # Build the generator from the category column of the input workbook.
    generator = NationalDataGenerator(
        excel_path="input.xlsx", category_column="基础分类"
    )

    # Produce 100 records.
    df = generator.generate_dataset(100)

    # Persist the result to an Excel workbook.
    with pd.ExcelWriter("government_12345_data.xlsx") as writer:
        df.to_excel(writer, index=False)

    # Print the first three rows as a quick preview.
    print("生成数据示例:")
    preview_columns = ["ID", "内容", "关键词", "参考答案"]
    preview = df[preview_columns].head(3)
    print(preview.to_string(index=False))